def get_divisions():
    """Scrape the list of divisions and major groups from the OSHA website

    Divisions are the broadest grouping of SIC codes provided by OSHA
    Major groups are the second broadest grouping of SIC codes provided by OSHA
    """
    # Read site
    soup = get_soup(config.OSHA_base_url + 'sic_manual.html')

    # Find content
    container = soup.select('div#maincontain')[0]
    master_list = container.find('div').find('ol')
    all_links = master_list.find_all('a')

    # Store cleaned descriptions from anchor elements
    divisions = []
    for i in range(len(all_links)):
        # Store the full description provided by the site and keep the associated link
        l = all_links[i]
        full_desc = str(l.contents[0]).strip().encode("utf-8")
        link = l.get('href').encode("utf-8")

        # Get the description of the parent group
        if i > 0 and clean_desc(full_desc)[1] == 'Major Group':
            parent_desc = get_parent(divisions, i, 'Major Group', 'Division')
        else:
            parent_desc = str(None)

        # Add to running list of named tuples
        divisions.append(ind_group(full_desc, parent_desc, link))

    return divisions
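# The scraper above leans on three helpers defined elsewhere in the project
# (ind_group, clean_desc, get_parent). A minimal sketch of what they might look
# like follows; the field names, the group-type prefixes, and the backwards
# parent lookup are assumptions for illustration, not the project's actual code.
from collections import namedtuple

# One scraped entry: the raw description, its parent group's description, and a link
ind_group = namedtuple('ind_group', ['full_desc', 'parent_desc', 'link'])


def clean_desc(full_desc):
    """Return (text, group_type) for a description such as
    'Major Group 01: Agricultural Production Crops'."""
    text = full_desc.decode('utf-8') if isinstance(full_desc, bytes) else full_desc
    for group_type in ('SIC4', 'Major Group', 'Industry Group', 'Division'):
        if text.startswith(group_type):
            return text, group_type
    return text, None


def get_parent(groups, i, child_type, parent_type):
    """Walk backwards from position i and return the description of the most
    recent entry whose type is parent_type (child_type is kept only to mirror
    the call sites above)."""
    for prior in reversed(groups[:i]):
        if clean_desc(prior.full_desc)[1] == parent_type:
            return prior.full_desc
    return str(None)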
def explore_book(allbook, url, conn, cursor):
    # Throttle requests, then fetch and parse the page for this book
    sleep(1)
    soup = get_soup(url)
    book = get_book(soup, url)
    print(book.name, book.score, book.url)

    # Record the book as explored; update allbook if this book qualifies
    insert_explored_book(book, conn, cursor)
    if is_target_book(allbook, book):
        update_allbook(allbook, book, conn, cursor)

    # Recurse into linked books that have not been visited yet
    urls = urls_for_more_books(soup)
    for url in urls:
        if not_explored(url, cursor):
            explore_book(allbook, url, conn, cursor)
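# explore_book() relies on a pair of DB helpers that keep the crawl from
# revisiting pages. A rough sketch, assuming a sqlite3-style connection and an
# explored_books table with url/name/score columns (the schema and names here
# are guesses for illustration, not the project's actual implementation):
def insert_explored_book(book, conn, cursor):
    # Record this page so later calls to not_explored() skip it
    cursor.execute(
        "INSERT OR IGNORE INTO explored_books (url, name, score) VALUES (?, ?, ?)",
        (book.url, book.name, book.score),
    )
    conn.commit()


def not_explored(url, cursor):
    # True if the URL has never been recorded by insert_explored_book()
    cursor.execute("SELECT 1 FROM explored_books WHERE url = ? LIMIT 1", (url,))
    return cursor.fetchone() is None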
def get_major(url_ext):
    """Scrape the list of major groups, industry groups and four-digit SIC codes
    from the OSHA website

    Major groups are the second broadest grouping of SIC codes provided by OSHA
    Industry groups are the third broadest (more granular) grouping of SIC codes
    provided by OSHA
    """
    # Read site
    soup = get_soup(config.OSHA_base_url + url_ext)

    # Isolate relevant content
    container = soup.select('div#maincontain')[0]
    groups = container.find_all(['strong', 'li'])
    major_desc = str(container.find_all('h2')[0].contents[0])

    # Store cleaned descriptions from strong and li elements
    majors = []
    for i in range(len(groups)):
        g = groups[i]

        # Get descriptions of industry groups and four-digit SIC codes
        if g.name == 'strong':
            # Industry group descriptions
            full_desc = g.contents[0].strip().encode("utf-8")
            link = None
        elif g.name == 'li':
            # Four-digit SIC code descriptions
            full_desc = 'SIC4 ' + str(g.contents[0]).strip() + \
                        ': ' + str(g.contents[1].contents[0]).strip()
            link = g.contents[1].get('href').encode("utf-8")
        else:
            # Otherwise raise a value error
            raise ValueError('Unexpected element type: ' + g.name)

        # Get the description of the parent group
        if i > 0 and clean_desc(full_desc)[1] == 'SIC4':
            parent_desc = get_parent(majors, i, 'SIC4', 'Industry Group')
        else:
            parent_desc = major_desc

        # Add to running list of named tuples
        majors.append(ind_group(full_desc, parent_desc, link))

    return majors
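# A hypothetical driver that chains the two OSHA scrapers: follow each
# major-group link collected by get_divisions() and pull its industry groups
# and SIC4 codes with get_major(). The field names reuse the assumed ind_group
# sketch above; the filtering on 'Major Group' is an assumption about how the
# two functions are meant to be combined.
def scrape_all_sic_groups():
    divisions = get_divisions()
    all_groups = list(divisions)
    for d in divisions:
        link = d.link.decode('utf-8') if isinstance(d.link, bytes) else d.link
        # Only major-group rows point at a page of industry groups / SIC4 codes
        if link and clean_desc(d.full_desc)[1] == 'Major Group':
            all_groups.extend(get_major(link))
    return all_groups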
def get_youtube_info(url):
    # Work out which kind of YouTube URL this is
    if 'channel' in url:
        tag = 'channel'
    elif 'playlist' in url:
        tag = 'playlist'
    elif 'user' in url:
        tag = 'user'
    else:
        return {}

    # Playlist URLs carry the id as a query parameter; channel/user URLs as a path segment
    sep = '=' if tag == 'playlist' else '/'
    id_ = url.split(sep)[-1]

    # Build the matching RSS feed URL (channel_id=, playlist_id=, or user=)
    prefix = 'https://www.youtube.com/feeds/videos.xml?'
    postfix = '_id=' if tag != 'user' else '='
    xml = prefix + tag + postfix + id_

    # Fetch the feed to pick up the channel/playlist title
    soup = get_soup(xml)
    name = soup.find('title').text

    return {
        'name': name,
        'tag': tag,
        'rss': xml
    }
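# Hypothetical usage (the channel id below is a placeholder): a channel URL maps
# to the channel_id feed, a playlist URL to playlist_id, and a legacy user URL to user=.
def print_youtube_feed_info(url='https://www.youtube.com/channel/UCxxxxxxxxxxxxxxxxxxxxxx'):
    info = get_youtube_info(url)
    # e.g. https://www.youtube.com/feeds/videos.xml?channel_id=UCxxxxxxxxxxxxxxxxxxxxxx
    print(info['rss'])
    # the feed's <title>, i.e. the channel or playlist name
    print(info['name'])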
def get_sic_sec():
    """Scrape SIC codes from the SEC website
    """
    # Setup
    soup = get_soup(config.SEC_base_url)
    table = soup.find_all('table')[3]

    # Convert HTML to nested list
    data = []
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        # Replace non-breaking spaces with regular spaces
        cols = [ele.text.strip().replace(u'\xa0', ' ') for ele in cols]
        if len(cols) > 1:
            data.append([ele.encode('utf-8') for ele in cols if ele])

    # Clean headers
    if data[0] != config.SEC_expected_columns:
        warnings.warn('Warning: column names have changed in URL ' +
                      config.SEC_base_url)
    data[0] = config.SEC_columns

    return data
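# A hypothetical way to persist the scraped SEC table; the filename and the
# bytes-to-text decode step are assumptions for illustration.
import csv

def write_sic_sec_csv(path='sec_sic_codes.csv'):
    """Dump the scraped SEC SIC table (header row first) to a CSV file."""
    data = get_sic_sec()
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        for row in data:
            # Cells are stored as UTF-8 bytes; decode before writing
            writer.writerow([c.decode('utf-8') if isinstance(c, bytes) else c
                             for c in row])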
import soup as s

url = "https://www.nytimes.com/"

if __name__ == "__main__":
    print(
        "Note:\n"
        "\tThe HTML is now quite irregular, "
        "so there is little point in checking whether this is 100% accurate.\n"
        "\tThe structure and tags will be different in a couple of years anyway.\n\n"
    )
    soup = s.get_soup(url)
    print(*s.get_all_tags(soup, 'span'), sep="\n")
    print(*s.get_all_tags(soup, 'h2'), sep="\n")
def __init__(self, url, last_date, item_name='item'):
    # Fetch and parse the source, and store the item tag name and the last date
    self.soup = get_soup(url)
    self.item = item_name
    self.last_date = last_date