Python get_soup Exemples, helpers.get_soup Python Exemples

Exemple #1

0

Afficher le fichier

    def get_course_outline(self) -> str:
        """Return the text that follows the page's "Course Summary" heading.

        Fetches ``self.base_url`` with ``get_soup`` and looks for the first
        ``h1``/``h2``/``h3`` whose stripped text is exactly "Course Summary".
        The stripped text of every tag sibling after that heading is
        concatenated until a sibling with the same tag name is reached, or
        until the sibling chain ends.

        When no such heading exists, the page is searched for a text node
        matching "View in browser"; if one is found, ``self.base_url`` is
        replaced by the ``href`` of that node's parent and nothing else is
        scraped.

        Returns:
            The stripped outline text, or "" when no summary heading is found.
        """
        soup = get_soup(self.base_url)
        heading = next(
            (x for x in soup.find_all(["h1", "h2", "h3"])
             if x.get_text(strip=True) == "Course Summary"),
            None,
        )

        if heading is None:
            # Check for iframe (presumably the outline lives on an external
            # page linked via "View in browser" -- confirm against the site).
            external = soup.find_all(text=re.compile("View in browser"))
            if external:
                # for now, just record the url
                self.base_url = external[0].parent["href"]
            return ""

        # This course uses webcms
        parts = []
        node = heading.next_sibling
        # The heading may be the last one of its level in its parent, in which
        # case the sibling chain ends with None before a matching tag is seen;
        # stop there instead of dereferencing None.
        while node is not None and node.name != heading.name:
            # Bare text nodes between tags are skipped; only tags contribute.
            if not isinstance(node, NavigableString):
                parts.append(node.get_text(strip=True))
            node = node.next_sibling

        return "".join(parts).strip()

Exemple #2

0

Afficher le fichier

Fichier : image.py Projet : Bob-78/Twitter-bot-cats

    def get_random_image_url(page_url):
        """Pick one lazily-loaded image from ``page_url`` and return its URL.

        The page is fetched with ``helpers.get_soup``; every ``<img>`` that
        carries a ``data-lazy-srcset`` attribute is a candidate, and
        ``helpers.random_list_number`` supplies the index of the one to use.
        The chosen srcset is then reduced to a single URL and its "_480"
        marker is replaced by "1280".
        """
        print("...function get_random_image_url called")

        page = helpers.get_soup(page_url)

        # Gather the srcset string of every lazily-loaded image on the page.
        srcsets = []
        for img in page.findAll('img', attrs={'data-lazy-srcset': True}):
            srcsets.append(img['data-lazy-srcset'])

        chosen_srcset = srcsets[helpers.random_list_number(srcsets)]

        # A srcset is a comma-separated list; take its second entry.
        second_entry = chosen_srcset.split(',')[1]
        # Take the second space-separated piece of that entry. Presumably the
        # entry starts with a space, so piece [0] is empty and piece [1] is the
        # URL, followed by the "2x" descriptor -- verify against the site.
        image_url = second_entry.split(' ')[1]
        # Swap the size marker to request the larger rendition.
        image_url = image_url.replace("_480", "1280")

        print("...Using this url: {}".format(image_url))

        return image_url

Exemple #3

0

Afficher le fichier

Fichier : program.py Projet : wyu17/uq_scraper

  def update(self):
    """Refresh this program's fields from its page on my.uq.edu.au.

    Does nothing except print a notice when ``self.code`` does not contain
    "acad_prog". Otherwise sets ``title``, ``level`` (first word of the
    title, lower-cased), ``units`` and finally replaces ``code`` with the
    integer formed by its last four characters.
    """
    if "acad_prog" not in self.code:
      print("not a program")
      return

    page = get_soup('https://my.uq.edu.au{}'.format(str(self.code)))

    # The title element feeds both the title and the level.
    program_title = page.find(id="program-title").get_text()
    self.title = program_title
    self.level = program_title.split(' ')[0].lower()

    units_text = page.find(id="program-domestic-units").get_text()
    self.units = int(units_text)

    # Only the trailing four characters of the link code are kept.
    self.code = int(self.code[-4:])

Exemple #4

0

Afficher le fichier

Fichier : application.py Projet : mariahdim/CS50_Final_Project

def home():
    """Render ``home.html`` with the menu for the upcoming week.

    For today and each of the following six days this collects the soups,
    lunch entrees, dinner entrees and the lunch/dinner desserts (each passed
    through ``unique``), plus the date itself and a "Month day" label.

    Returns:
        The result of ``render_template("home.html", ...)``. Every list
        passed to the template has exactly ``span`` entries, index-aligned
        by day.
    """
    # we want to display the menu for the upcoming week:
    span = 7

    # One entry per day is appended to each list. The original pre-filled
    # ``dates`` with ``span`` placeholders and then appended to it, which left
    # ``span`` leading None values and broke alignment with the other lists.
    dates = []
    nice_dates = []
    soups = []
    lunches = []
    dinners = []
    dessert_lunches = []
    dessert_dins = []

    # Textual month, day and year (for display in jumbotron)
    today = datetime.today()
    d = today.strftime("%b. %d, %Y")

    for offset in range(span):
        date = (today + timedelta(days=offset)).date()
        dates.append(date)
        nice_dates.append(date.strftime('%B %d'))

        # Soup of the day
        soups.append(unique(get_soup(date)))

        # Entrees
        lunches.append(unique(get_lunch(date)))
        dinners.append(unique(get_dinner(date)))

        # Desserts: meal=1 is used for lunch, meal=2 for dinner
        dessert_lunches.append(unique(get_dessert(date, meal=1)))
        dessert_dins.append(unique(get_dessert(date, meal=2)))

    return render_template("home.html",
                           dates=dates,
                           nice_dates=nice_dates,
                           soups=soups,
                           lunches=lunches,
                           dinners=dinners,
                           dessert_lunches=dessert_lunches,
                           dessert_dins=dessert_dins,
                           d=d)

Exemple #5

0

Afficher le fichier

    def update(self, linkCode: str):
        """Refresh this plan's title, code and type from my.uq.edu.au.

        ``linkCode`` is appended to the site root to form the page URL; its
        last ten characters become ``self.code``. ``self.type`` is derived
        from the page title: "eMajor", "minor" or "major".
        """
        page = get_soup('https://my.uq.edu.au{}'.format(linkCode))

        heading = page.find(id="page-head").find("h1")
        self.title = heading.get_text()
        self.code = linkCode[-10:]

        title = self.title
        # Treat specialisations as extended majors
        is_extended = any(
            marker in title for marker in ("Extended Major", "Specialisation"))
        if is_extended:
            plan_type = "eMajor"
        elif "Minor" in title:
            plan_type = "minor"
        else:
            plan_type = "major"
        self.type = plan_type

Exemple #6

0

Afficher le fichier

Fichier : image.py Projet : Bob-78/Twitter-bot-cats

    def get_total_pages(base_url):
        """Return the first integer found in the search form of page 1.

        Page ``base_url + "1"`` is fetched with ``helpers.get_soup``; the text
        of its search-parameters form is split on whitespace and the first
        all-digit token is returned as an ``int``.
        """
        print("...function total_pages called")

        # Page 1 is used as the place to read the page count from.
        first_page = helpers.get_soup(base_url + "1")

        form_attrs = {
            'class': 'add_search_params pure-form hide-xs hide-sm hide-md'
        }
        form_text = first_page.find('form', form_attrs).getText()

        # Convert every purely numeric token; the first one is the result.
        numbers = [int(token) for token in form_text.split() if token.isdigit()]
        total_pages = numbers[0]

        print("...returning total pages: {}".format(total_pages))

        return total_pages

Exemple #7

0

Afficher le fichier

Fichier : scrape_champions.py Projet : delafields/delafields.github.io

def get_champs_table(league):
    """Scrape the champions table for ``league`` into a DataFrame.

    The page URL and table selector come from ``championship_urls[league]``
    (items 0 and 1). For "La Liga" the selector is used as a CSS class name
    and the last matching ``<table>`` is taken; for every other league it is
    used as a CSS selector and the first match is taken.

    Args:
        league: Key into ``championship_urls``; also written to the
            "League" column of the result.

    Returns:
        The first table parsed from the selected element, with an added
        "League" column.

    Raises:
        ValueError: If no table element matches the selector.
    """
    # Local import keeps this block self-contained.
    import io

    print(f"Workin on {league}")

    url = championship_urls[league][0]
    selector = championship_urls[league][1]

    soup = get_soup(url)

    if league == "La Liga":
        champ_table = soup.find_all("table", {"class": selector})[-1]
    else:
        champ_table = soup.select_one(selector)

    if champ_table is None:
        # Fail with a clear message rather than handing the text "None" to
        # the HTML parser.
        raise ValueError(
            f"No championship table found for {league!r} at {url}")

    # Newer pandas deprecates passing literal HTML to read_html; a file-like
    # wrapper is accepted by old and new versions alike.
    champ_df = pd.read_html(io.StringIO(str(champ_table)))[0]
    champ_df["League"] = league

    return champ_df

Exemple #8

0

Afficher le fichier

Fichier : course.py Projet : wyu17/uq_scraper

    def update(self):
        """Refresh this course's fields from the UQ course page.

        Returns ``None`` without touching any field when the page could not
        be fetched or contains a "course-notfound" element. Otherwise sets
        ``description``, ``title`` and ``units``, flags ``sem1`` / ``sem2`` /
        ``summer`` with 1 for each offering mentioned, and sets ``prereq`` /
        ``incomp`` when the corresponding elements exist.
        """
        page = get_soup(
            'http://www.uq.edu.au/study/course.html?course_code={}'.format(
                self.code))

        if page is None or page.find(id="course-notfound"):
            return None

        # Double quotes are dropped and single quotes doubled in the summary.
        summary = page.find(id="course-summary").get_text()
        summary = summary.replace('"', '').replace("'", "''")
        # apparent edge case; see STAT2203 -- only the first line is kept
        self.description = summary.split('\n')[0]

        # The last 11 characters of the title text are cut off.
        raw_title = page.find(id="course-title").get_text()
        self.title = raw_title[:-11].replace("'", "''")
        self.units = int(page.find(id="course-units").get_text())

        offerings = str(page.find_all(id="course-current-offerings"))
        offering_flags = (
            ("Semester 1, ", "sem1"),
            ("Semester 2, ", "sem2"),
            ("Summer Semester, ", "summer"),
        )
        for marker, attribute in offering_flags:
            if marker in offerings:
                setattr(self, attribute, 1)

        prereq = page.find(id="course-prerequisite")
        if prereq is not None:
            # Anything that is not exactly a str is reduced to its text.
            if type(prereq) is not str:
                prereq = prereq.get_text()
            self.prereq = prereq

        incomp = page.find(id="course-incompatible")
        if incomp is not None:
            self.incomp = incomp.get_text()

Exemple #9

0

Afficher le fichier

def is_webcms3(course_code, offering_term):
    """Return the WebCMS URL for this offering, or "" if the page is missing.

    The URL is built from ``WEBCMS_URL`` and fetched with ``get_soup``; the
    page counts as missing when it contains an ``<h2>`` whose string is
    "The page was not found.".
    """
    url = WEBCMS_URL.format(course_code, offering_term)
    not_found_banner = get_soup(url).find(
        "h2", string="The page was not found.")
    if not_found_banner is None:
        return url
    return ""

Exemple #10

0

Afficher le fichier

    courses = json.load(courses_file)

# if len(sys.argv) > 1:
#     courses = [c for c in courses if c["code"] == sys.argv[1]]
# with open("src/course_scraper/course_host.json") as course_host_file:
#     course_hosts = json.load(course_host_file)
keywords = ["course objectives", "course summary", "course aims", "aims"]
headers = ["h1", "h2", "h3"]
for course in courses:
    if (("outline" in course and course["outline"]) or not course["url"]
            or "private" in course):
        continue

    course_code = course["code"]
    url = course["url"]
    soup = get_soup(url)
    # print(soup.find_all(["h1", "h2", "h3"])[3].get_text(strip=True).lower() in keywords)
    course_summary = next(
        (x for x in soup.find_all(headers)
         if x.get_text(strip=True).lower() in keywords),
        None,
    )
    print(course_summary)
    outline = ""
    if course_summary is not None:
        orig = course_summary.name
        course_summary = course_summary.next_sibling
        while course_summary is not None and (
                course_summary.name not in headers
                or not course_summary.get_text(strip=True)):
            if not isinstance(course_summary, NavigableString):