import time

from bs4 import Comment


def searchGongZhongHao(word, count):
    # Fetch the search results page
    soup = requestSoup("http://weixin.sogou.com/weixin?type=2&query=" + word + "&ie=utf8&s_from=input&_sug_=y&_sug_type_=&w=01019900&sut=1713&sst0=1499419256365&lkt=1%2C1499419256260%2C1499419256260")
    # Strip out HTML comments
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # Structure the results
    this_search_list = []
    for result in soup.find(class_='news-list').find_all('li', limit=count):
        search = {}  # Re-initialize search for each result
        search['title'] = ''.join([item.string for item in result.find('h3').find('a').contents])
        search['introduce'] = ''.join([item.string for item in result.find(class_='txt-info')])
        search['from_'] = 'gongzhonghao_' + result.find(class_='account').string
        search['url'] = result.find(class_='txt-box').find('a')['href']
        # Sogou seems to block image requests that don't come from the search
        # page; if this stops working, fall back to setting this to ''
        search['image'] = result.find('img')['src']
        search['time'] = time.time()
        this_search_list.append(search)
    return this_search_list
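# `requestSoup` is not defined in this snippet. A minimal sketch of what it
# presumably does -- fetch a URL and parse it with BeautifulSoup -- assuming
# the `requests` library and a browser-like User-Agent (Sogou tends to block
# clients that look like bots); the header value is illustrative only:
import requests
from bs4 import BeautifulSoup


def requestSoup(url):
    # Hypothetical helper: fetch the page and return its parsed soup
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

# Example usage (hypothetical query):
#   results = searchGongZhongHao('python', 5)
#   for item in results:
#       print item['title'], item['url']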
import codecs
import os
from collections import defaultdict

from bs4 import BeautifulSoup

# `triple_up` is assumed to be a module-level compiled regex; see the sketch
# after this function.


def break_up_sections(path, module_data, config):
    book_name = config.book_name
    sections = module_data['sections']
    module_map = config['module_map']
    course_id = config.course_id
    item_url = config.LMS_url + "/courses/{course_id}/modules/items/{item_id}"
    assignment_url = config.LMS_url + "/courses/{course_id}/assignments/{assignment_id}?module_item_id={module_item_id}"
    # Read contents of module HTML file
    try:
        with codecs.open(path, 'r', 'utf-8') as html_file:
            html = html_file.read()
    except IOError:
        print "Error: Could not find HTML file for", path
        return {}
    # Get the module name and create its subfolder
    mod_name = os.path.splitext(os.path.basename(path))[0]
    # Strip out the script, style, link, and meta tags
    soup = BeautifulSoup(html, "html.parser")
    verbose = False
    if verbose:
        print "Found HTML file:", mod_name
    TAGS = [('script', 'src'), ('link', 'href'), ('img', 'src'), ('a', 'href')]
    # KILL MATHJAX
    # Helpful for debugging, because MathJax takes forever to load
    #for possible_math_jax in soup.find_all('script'):
    #    if possible_math_jax.has_attr('src') and possible_math_jax['src'].startswith('//cdn.mathjax.org/mathjax'):
    #        possible_math_jax.extract()
    # Find all of the scripts, links, images, etc. that we might need
    for tag_name, tag_url in TAGS + [('div', 'data-frame-src')]:
        for a_tag in soup(tag_name):
            if a_tag.has_attr(tag_url):
                if triple_up.match(a_tag[tag_url]):
                    a_tag[tag_url] = 'OpenDSA/' + a_tag[tag_url][len('../../../'):]
                elif a_tag[tag_url].startswith('_static/'):
                    a_tag[tag_url] = 'OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
                elif a_tag[tag_url].startswith('_images/'):
                    a_tag[tag_url] = 'OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
    # Skip any sections that don't have points
    # Redirect href urls
    for link in soup.find_all('a'):
        if 'href' not in link.attrs:
            # Really? No href? Is that even valid HTML?
            continue
        href = link['href']
        if href == '#':
            # Skip dummy urls redirecting to itself
            continue
        elif href.startswith('#'):
            # An internal page link; leave it alone
            continue
        elif href.startswith('mailto:'):
            # Email
            continue
        elif href.startswith('http://'):
            # Offsite
            continue
        elif href.startswith('../'):
            # Current directory
            continue
        elif href.endswith('.rst'):
            # The source reference
            continue
        else:
            if '#' in href:
                external, internal = href.split('#', 1)
            else:
                external, internal = href, ''
            if external.endswith('.html'):
                # Snip off the ".html"
                external = external[:-5]
            # Map it to the proper folder in Canvas
            if external in module_map:
                module_obj = module_map[external]
                if 'assignment_id' in module_map[external]:
                    external = assignment_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'),
                        assignment_id=module_obj.get('assignment_id'))
                else:
                    external = item_url.format(
                        course_id=course_id,
                        item_id=module_obj.get('item_id'))
            # Force it to approach it from the top
            link['href'] = '#'.join((external, internal))
    # Move header scripts out of header, kill header
    header_tag = soup.find('div', class_='header')
    for bit in reversed(header_tag.contents):
        if bit.name in ('script', 'link'):
            header_tag.next_sibling.insert_before(bit.extract())
    header_tag.extract()
    # Remove unnecessary parts of the HTML
    for class_name in ('topnav', 'bottomnav'):
        element = soup.find('div', class_=class_name)
        if element:
            element.extract()
    element = soup.find('img', alt='nsf')
    if element:
        element.extract()
    total_real_exercises = len(sections)
    #total_real_exercises = 0
    #for exercise, properties in exercises.items():
    #    if 'points' in properties:
    #        total_real_exercises += 1
    if total_real_exercises <= 1:
        if total_real_exercises == 0:
            filename = mod_name + '.html'
        else:
            filename = mod_name + '-01.html'
        single_file_path = os.path.join(os.path.dirname(path), '..', 'lti_html', filename)
        with codecs.open(single_file_path, 'w', 'utf-8') as o:
            o.write(unicode(soup))
        return None
    # Collect out the slide-specific JS/CSS
    slide_scripts = defaultdict(list)
    all_scripts = []
    for tag_name, tag_url in TAGS:
        if tag_name == 'link':
            continue  # Expand this to handle src
        for a_tag in soup.find_all(tag_name):
            if a_tag.has_attr(tag_url) and (
                    a_tag[tag_url].startswith('OpenDSA/AV/') or
                    a_tag[tag_url].startswith('OpenDSA/DataStructures/')):
                name = os.path.splitext(os.path.basename(a_tag[tag_url]))[0]
                script_tag = a_tag.extract()
                if "CON" in name and tag_name == "script":
                    slide_scripts[name].append(script_tag)
                else:
                    all_scripts.append(script_tag)
                #if name.endswith('Common.css'):
    # Breaking file into components
    # First pass: grab out all of the HTML fragments
    content_div_soup = soup.find('div', class_='content')
    section_divs_soup = content_div_soup.find_all('div', class_='section', recursive=False)
    content_div = []
    # A body is an HTML fragment within a subsection
    total_bodies = 0
    # Iterate over the top-level sections
    for section_div_soup in section_divs_soup:
        section_div = []
        # And then iterate over the second-level sections
        for subsection_div_soup in list(section_div_soup.contents):
            subsection_div = []
            if (subsection_div_soup.name == 'div'
                    and subsection_div_soup.has_attr('class')
                    and 'section' in subsection_div_soup['class']):
                # This is a subsection, grab its children
                for body_soup in list(subsection_div_soup.contents):
                    subsection_div.append((body_soup.parent, body_soup.extract()))
                    total_bodies += 1
            else:
                # This is section starter content.
                body_soup = subsection_div_soup
                subsection_div.append((body_soup.parent, body_soup.extract()))
                total_bodies += 1
            # Capture this subsection into this section
            section_div.append(subsection_div)
        # Capture this section into the complete content
        content_div.append(section_div)
    if verbose:
        print "\tPhase 1: Found {} pieces of body content".format(total_bodies)
    # Second pass: cluster body fragments by exercises into "slides"
    total_exercises = 0
    slides = []
    new_slide = []
    found = []
    previous_parent = None
    i = 0
    for section_div in content_div:
        for subsection_div in section_div:
            name = "NEXT SLIDE"
            for parent, body in subsection_div:
                new_slide.append((parent, body))
            # Ignore index/anchor spans when deciding whether the slide is empty
            body_text = [str(s[1]) for s in new_slide
                         if s[1].name != 'span'
                         or not s[1].has_attr('id')
                         or (not s[1]['id'].startswith('index-')
                             and not s[1]['id'].startswith('id1'))]
            if not ''.join(body_text).strip():
                continue
            slides.append((name, new_slide))
            new_slide = []
            total_exercises += 1
            found.append(name)
    if verbose:
        print "\tPhase 2: Clustered into {} slides. Found {} sections, expected {}.".format(len(slides), total_exercises - 1, len(sections))
    # Add the slide general scripts to the top.
    sgs_div = soup.new_tag('div', id='SLIDE-GENERAL-SCRIPTS')
    for script_tag in all_scripts:
        sgs_div.insert(0, script_tag)
    content_div_soup.insert_before(sgs_div)
    # Third pass: render them out with the relevant scripts
    for index, (slide_name, slide) in enumerate(slides):
        # Identify the new filename
        slide_filename = '{0}-{1:02d}.html'.format(mod_name, index)
        slide_filepath = os.path.join(os.path.dirname(path), '..', 'lti_html', slide_filename)
        # Add the relevant content back in
        for body_index, (parent, body) in enumerate(slide):
            parent.insert(body_index, body)
        if index != 0:
            potential_exercises = sections.values()[index - 1].keys()
        else:
            potential_exercises = []
        sss_div = soup.new_tag('div', id='SLIDE-SPECIFIC-SCRIPTS')
        for potential_exercise in potential_exercises:
            if potential_exercise in slide_scripts:
                for a_script in slide_scripts[potential_exercise]:
                    sss_div.insert(0, a_script)
            if potential_exercise in ('quicksortCON', 'bubblesortCON'):
                for a_script in slide_scripts[potential_exercise.replace('CON', 'CODE')]:
                    sss_div.insert(0, a_script)
        # Add back in slide specific scripts
        sgs_div.insert_after(sss_div)
        # Write out the file with what we have so far
        with codecs.open(slide_filepath, 'w', 'utf-8') as o:
            o.write(unicode(soup))
        sss_div.decompose()
        for parent, body in slide:
            body.extract()
    if verbose:
        print "\tPhase 3: complete"
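# `triple_up` is used by every break_up_sections variant but never defined in
# these snippets. A plausible sketch, assuming it matches a run of leading
# `../` segments (the variant above slices off exactly three levels; the
# later variants slice off either the matched prefix or a fixed six levels):
import re

triple_up = re.compile(r'^(\.\./)+')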
import codecs
import os

from bs4 import BeautifulSoup

# Relies on the module-level `triple_up` regex sketched above.


def break_up_sections(path, module_data, config):
    print(path)
    book_name = config.book_name
    sections = module_data['sections']
    module_map = config['module_map']
    course_id = config.course_id
    if module_map:
        item_url = config.LMS_url + "/courses/{course_id}/modules/items/{module_item_id}"
        assignment_url = config.LMS_url + "/courses/{course_id}/assignments/{assignment_id}?module_item_id={module_item_id}"
    # Read contents of module HTML file
    try:
        with codecs.open(path, 'r', 'utf-8') as html_file:
            html = html_file.read()
    except IOError:
        print "Error: Could not find HTML file for", path
        return {}
    # Get the module name and create its subfolder
    mod_name = os.path.splitext(os.path.basename(path))[0]
    # Strip out the script, style, link, and meta tags
    soup = BeautifulSoup(html, "html.parser")
    verbose = False
    if verbose:
        print "Found HTML file:", mod_name
    TAGS = [('script', 'src'), ('link', 'href'), ('img', 'src'),
            ('a', 'href'), ('iframe', 'src')]
    # KILL MATHJAX
    # Helpful for debugging, because MathJax takes forever to load
    #for possible_math_jax in soup.find_all('script'):
    #    if possible_math_jax.has_attr('src') and possible_math_jax['src'].startswith('//cdnjs.cloudflare.com/ajax/libs/mathjax'):
    #        possible_math_jax.extract()
    # Find all of the scripts, links, images, etc. that we might need
    for tag_name, tag_url in TAGS + [('div', 'data-frame-src')]:
        for a_tag in soup(tag_name):
            if a_tag.has_attr(tag_url):
                match = triple_up.match(a_tag[tag_url])
                if match:
                    a_tag[tag_url] = '/OpenDSA/' + a_tag[tag_url][len(match.group(0)):]
                elif a_tag[tag_url].startswith('_static/'):
                    a_tag[tag_url] = '/OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
                elif a_tag[tag_url].startswith('_images/'):
                    a_tag[tag_url] = '/OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
    # Skip any sections that don't have points
    # Redirect href urls
    for link in soup.find_all('a'):
        if 'href' not in link.attrs:
            # Really? No href? Is that even valid HTML?
            continue
        href = link['href']
        if href == '#':
            # Skip dummy urls redirecting to itself
            continue
        elif href.startswith('#'):
            # An internal page link; leave it alone
            continue
        elif href.startswith('mailto:'):
            # Email
            continue
        elif href.startswith('http://'):
            # Offsite
            continue
        elif href.startswith('../'):
            # Current directory
            continue
        elif href.endswith('.rst'):
            # The source reference
            continue
        else:
            if '#' in href:
                external, internal = href.split('#', 1)
            else:
                external, internal = href, ''
            if external.endswith('.html'):
                # Snip off the ".html"
                external = external[:-5]
            # Map it to the proper folder in Canvas
            if module_map and external in module_map:
                module_obj = module_map[external]
                if 'assignment_id' in module_map[external] and module_obj.get('assignment_id') is not None:
                    external = assignment_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'),
                        assignment_id=module_obj.get('assignment_id'))
                else:
                    external = item_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'))
            # Force it to approach it from the top
            link['href'] = '#'.join((external, internal))
    # Move header scripts out of header, kill header
    header_tag = soup.find('div', class_='header')
    for bit in reversed(header_tag.contents):
        if bit.name in ('script', 'link'):
            header_tag.next_sibling.insert_before(bit.extract())
    header_tag.extract()
    # Remove unnecessary parts of the HTML
    for class_name in ('topnav', 'bottomnav'):
        element = soup.find('div', class_=class_name)
        if element:
            element.extract()
    element = soup.find('img', alt='nsf')
    if element:
        element.extract()
    filename = mod_name + '.html'
    single_file_path = os.path.join(os.path.dirname(path), '..', 'lti_html', filename)
    with codecs.open(single_file_path, 'w', 'utf-8') as o:
        o.write(unicode(soup))
    return None
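# The break_up_sections variants expect `config` to support both attribute
# access (config.book_name) and item access (config['module_map']). A minimal
# sketch of a compatible object; the field names mirror the attributes used
# above, everything else is an assumption:
class Config(object):
    def __init__(self, book_name, course_id, LMS_url, module_map):
        self.book_name = book_name
        self.course_id = course_id
        self.LMS_url = LMS_url
        self.module_map = module_map

    def __getitem__(self, key):
        # Dict-style lookup falls through to the attributes
        return getattr(self, key)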
import codecs
import os

from bs4 import BeautifulSoup

# Relies on the module-level `triple_up` regex sketched above.


def break_up_sections(path, module_data, config):
    print(path)
    book_name = config.book_name
    sections = module_data['sections']
    module_map = config['module_map']
    course_id = config.course_id
    if module_map:
        item_url = config.LMS_url + "/courses/{course_id}/modules/items/{module_item_id}"
        assignment_url = config.LMS_url + "/courses/{course_id}/assignments/{assignment_id}?module_item_id={module_item_id}"
    # Read contents of module HTML file
    try:
        with codecs.open(path, 'r', 'utf-8') as html_file:
            html = html_file.read()
    except IOError:
        print "Error: Could not find HTML file for", path
        return {}
    # Get the module name and create its subfolder
    mod_name = os.path.splitext(os.path.basename(path))[0]
    # Strip out the script, style, link, and meta tags
    soup = BeautifulSoup(html, "html.parser")
    verbose = False
    if verbose:
        print "Found HTML file:", mod_name
    TAGS = [('script', 'src'), ('link', 'href'), ('img', 'src'), ('a', 'href')]
    # KILL MATHJAX
    # Helpful for debugging, because MathJax takes forever to load
    #for possible_math_jax in soup.find_all('script'):
    #    if possible_math_jax.has_attr('src') and possible_math_jax['src'].startswith('//cdnjs.cloudflare.com/ajax/libs/mathjax'):
    #        possible_math_jax.extract()
    # Find all of the scripts, links, images, etc. that we might need
    for tag_name, tag_url in TAGS + [('div', 'data-frame-src')]:
        for a_tag in soup(tag_name):
            if a_tag.has_attr(tag_url):
                if triple_up.match(a_tag[tag_url]):
                    # Strips a fixed six-level '../' prefix, unlike the
                    # variant above, which slices off whatever matched
                    a_tag[tag_url] = '/OpenDSA/' + a_tag[tag_url][len('../../../../../../'):]
                elif a_tag[tag_url].startswith('_static/'):
                    a_tag[tag_url] = '/OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
                elif a_tag[tag_url].startswith('_images/'):
                    a_tag[tag_url] = '/OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
    # Skip any sections that don't have points
    # Redirect href urls
    for link in soup.find_all('a'):
        if 'href' not in link.attrs:
            # Really? No href? Is that even valid HTML?
            continue
        href = link['href']
        if href == '#':
            # Skip dummy urls redirecting to itself
            continue
        elif href.startswith('#'):
            # An internal page link; leave it alone
            continue
        elif href.startswith('mailto:'):
            # Email
            continue
        elif href.startswith('http://'):
            # Offsite
            continue
        elif href.startswith('../'):
            # Current directory
            continue
        elif href.endswith('.rst'):
            # The source reference
            continue
        else:
            if '#' in href:
                external, internal = href.split('#', 1)
            else:
                external, internal = href, ''
            if external.endswith('.html'):
                # Snip off the ".html"
                external = external[:-5]
            # Map it to the proper folder in Canvas
            if module_map and external in module_map:
                module_obj = module_map[external]
                if 'assignment_id' in module_map[external] and module_obj.get('assignment_id') is not None:
                    external = assignment_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'),
                        assignment_id=module_obj.get('assignment_id'))
                else:
                    external = item_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'))
            # Force it to approach it from the top
            link['href'] = '#'.join((external, internal))
    # Move header scripts out of header, kill header
    header_tag = soup.find('div', class_='header')
    for bit in reversed(header_tag.contents):
        if bit.name in ('script', 'link'):
            header_tag.next_sibling.insert_before(bit.extract())
    header_tag.extract()
    # Remove unnecessary parts of the HTML
    for class_name in ('topnav', 'bottomnav'):
        element = soup.find('div', class_=class_name)
        if element:
            element.extract()
    element = soup.find('img', alt='nsf')
    if element:
        element.extract()
    filename = mod_name + '.html'
    single_file_path = os.path.join(os.path.dirname(path), '..', 'lti_html', filename)
    with codecs.open(single_file_path, 'w', 'utf-8') as o:
        o.write(unicode(soup))
    return None
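# A sketch of how break_up_sections might be driven over a book's module HTML
# files; the directory layout and the shape of module_data are assumptions,
# not taken from the snippets above:
import glob
import os


def process_modules(html_dir, book_config):
    for module_path in glob.glob(os.path.join(html_dir, '*.html')):
        module_data = {'sections': {}}  # hypothetical per-module metadata
        break_up_sections(module_path, module_data, book_config)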