import time

from bs4 import Comment


def searchGongZhongHao(word, count):
    # Fetch the search results page
    soup = requestSoup("http://weixin.sogou.com/weixin?type=2&query=" + word + "&ie=utf8&s_from=input&_sug_=y&_sug_type_=&w=01019900&sut=1713&sst0=1499419256365&lkt=1%2C1499419256260%2C1499419256260")
    # Strip out HTML comments
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # Structure the results
    this_search_list = []
    for result in soup.find(class_='news-list').find_all('li', limit=count):
        search = {}  # Re-initialize search for each result
        search['title'] = ''.join([item.string for item in result.find('h3').find('a').contents])
        search['introduce'] = ''.join([item.string for item in result.find(class_='txt-info')])
        search['from_'] = 'gongzhonghao_' + result.find(class_='account').string
        search['url'] = result.find(class_='txt-box').find('a')['href']
        # Sogou seems to block image requests that don't come from the search
        # page; if this stops working, fall back to setting this to ''
        search['image'] = result.find('img')['src']
        search['time'] = time.time()
        this_search_list.append(search)
    return this_search_list
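# `requestSoup` is not defined in this snippet. A minimal sketch of what it
# presumably does -- fetch a URL and parse it with BeautifulSoup -- assuming
# the `requests` library and a browser-like User-Agent (Sogou tends to block
# clients that look like bots); the header value is illustrative only:
import requests
from bs4 import BeautifulSoup


def requestSoup(url):
    # Hypothetical helper: fetch the page and return its parsed soup
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')

# Example usage (hypothetical query):
#   results = searchGongZhongHao('python', 5)
#   for item in results:
#       print item['title'], item['url']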
import codecs
import os
from collections import defaultdict

from bs4 import BeautifulSoup

# `triple_up` is assumed to be a module-level compiled regex; see the sketch
# after this function.


def break_up_sections(path, module_data, config):
    book_name = config.book_name
    sections = module_data['sections']
    module_map = config['module_map']
    course_id = config.course_id
    item_url = config.LMS_url + "/courses/{course_id}/modules/items/{item_id}"
    assignment_url = config.LMS_url + "/courses/{course_id}/assignments/{assignment_id}?module_item_id={module_item_id}"
    # Read contents of module HTML file
    try:
        with codecs.open(path, 'r', 'utf-8') as html_file:
            html = html_file.read()
    except IOError:
        print "Error: Could not find HTML file for", path
        return {}
    # Get the module name and create its subfolder
    mod_name = os.path.splitext(os.path.basename(path))[0]
    # Strip out the script, style, link, and meta tags
    soup = BeautifulSoup(html, "html.parser")
    verbose = False
    if verbose:
        print "Found HTML file:", mod_name
    TAGS = [('script', 'src'), ('link', 'href'), ('img', 'src'), ('a', 'href')]
    # KILL MATHJAX
    # Helpful for debugging, because MathJax takes forever to load
    #for possible_math_jax in soup.find_all('script'):
    #    if possible_math_jax.has_attr('src') and possible_math_jax['src'].startswith('//cdn.mathjax.org/mathjax'):
    #        possible_math_jax.extract()
    # Find all of the scripts, links, images, etc. that we might need
    for tag_name, tag_url in TAGS + [('div', 'data-frame-src')]:
        for a_tag in soup(tag_name):
            if a_tag.has_attr(tag_url):
                if triple_up.match(a_tag[tag_url]):
                    a_tag[tag_url] = 'OpenDSA/' + a_tag[tag_url][len('../../../'):]
                elif a_tag[tag_url].startswith('_static/'):
                    a_tag[tag_url] = 'OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
                elif a_tag[tag_url].startswith('_images/'):
                    a_tag[tag_url] = 'OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
    # Skip any sections that don't have points
    # Redirect href urls
    for link in soup.find_all('a'):
        if 'href' not in link.attrs:
            # Really? No href? Is that even valid HTML?
            continue
        href = link['href']
        if href == '#':
            # Skip dummy urls redirecting to itself
            continue
        elif href.startswith('#'):
            # An internal page link; leave it alone
            continue
        elif href.startswith('mailto:'):
            # Email
            continue
        elif href.startswith('http://'):
            # Offsite
            continue
        elif href.startswith('../'):
            # Current directory
            continue
        elif href.endswith('.rst'):
            # The source reference
            continue
        else:
            if '#' in href:
                external, internal = href.split('#', 1)
            else:
                external, internal = href, ''
            if external.endswith('.html'):
                # Snip off the ".html"
                external = external[:-5]
            # Map it to the proper folder in Canvas
            if external in module_map:
                module_obj = module_map[external]
                if 'assignment_id' in module_map[external]:
                    external = assignment_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'),
                        assignment_id=module_obj.get('assignment_id'))
                else:
                    external = item_url.format(
                        course_id=course_id,
                        item_id=module_obj.get('item_id'))
            # Force it to approach it from the top
            link['href'] = '#'.join((external, internal))
    # Move header scripts out of header, kill header
    header_tag = soup.find('div', class_='header')
    for bit in reversed(header_tag.contents):
        if bit.name in ('script', 'link'):
            header_tag.next_sibling.insert_before(bit.extract())
    header_tag.extract()
    # Remove unnecessary parts of the HTML
    for class_name in ('topnav', 'bottomnav'):
        element = soup.find('div', class_=class_name)
        if element:
            element.extract()
    element = soup.find('img', alt='nsf')
    if element:
        element.extract()
    total_real_exercises = len(sections)
    #total_real_exercises = 0
    #for exercise, properties in exercises.items():
    #    if 'points' in properties:
    #        total_real_exercises += 1
    if total_real_exercises <= 1:
        if total_real_exercises == 0:
            filename = mod_name + '.html'
        else:
            filename = mod_name + '-01.html'
        single_file_path = os.path.join(os.path.dirname(path), '..', 'lti_html', filename)
        with codecs.open(single_file_path, 'w', 'utf-8') as o:
            o.write(unicode(soup))
        return None
    # Collect out the slide-specific JS/CSS
    slide_scripts = defaultdict(list)
    all_scripts = []
    for tag_name, tag_url in TAGS:
        if tag_name == 'link':
            continue  # Expand this to handle src
        for a_tag in soup.find_all(tag_name):
            if a_tag.has_attr(tag_url) and (
                    a_tag[tag_url].startswith('OpenDSA/AV/') or
                    a_tag[tag_url].startswith('OpenDSA/DataStructures/')):
                name = os.path.splitext(os.path.basename(a_tag[tag_url]))[0]
                script_tag = a_tag.extract()
                if "CON" in name and tag_name == "script":
                    slide_scripts[name].append(script_tag)
                else:
                    all_scripts.append(script_tag)
                #if name.endswith('Common.css'):
    # Breaking file into components
    # First pass: grab out all of the HTML fragments
    content_div_soup = soup.find('div', class_='content')
    section_divs_soup = content_div_soup.find_all('div', class_='section', recursive=False)
    content_div = []
    # A body is an HTML fragment within a subsection
    total_bodies = 0
    # Iterate over the top-level sections
    for section_div_soup in section_divs_soup:
        section_div = []
        # And then iterate over the second-level sections
        for subsection_div_soup in list(section_div_soup.contents):
            subsection_div = []
            if (subsection_div_soup.name == 'div'
                    and subsection_div_soup.has_attr('class')
                    and 'section' in subsection_div_soup['class']):
                # This is a subsection, grab its children
                for body_soup in list(subsection_div_soup.contents):
                    subsection_div.append((body_soup.parent, body_soup.extract()))
                    total_bodies += 1
            else:
                # This is section starter content.
                body_soup = subsection_div_soup
                subsection_div.append((body_soup.parent, body_soup.extract()))
                total_bodies += 1
            # Capture this subsection into this section
            section_div.append(subsection_div)
        # Capture this section into the complete content
        content_div.append(section_div)
    if verbose:
        print "\tPhase 1: Found {} pieces of body content".format(total_bodies)
    # Second pass: cluster body fragments by exercises into "slides"
    total_exercises = 0
    slides = []
    new_slide = []
    found = []
    previous_parent = None
    i = 0
    for section_div in content_div:
        for subsection_div in section_div:
            name = "NEXT SLIDE"
            for parent, body in subsection_div:
                new_slide.append((parent, body))
            # Ignore index/anchor spans when deciding whether the slide is empty
            body_text = [str(s[1]) for s in new_slide
                         if s[1].name != 'span'
                         or not s[1].has_attr('id')
                         or (not s[1]['id'].startswith('index-')
                             and not s[1]['id'].startswith('id1'))]
            if not ''.join(body_text).strip():
                continue
            slides.append((name, new_slide))
            new_slide = []
            total_exercises += 1
            found.append(name)
    if verbose:
        print "\tPhase 2: Clustered into {} slides. Found {} sections, expected {}.".format(len(slides), total_exercises - 1, len(sections))
    # Add the slide general scripts to the top.
    sgs_div = soup.new_tag('div', id='SLIDE-GENERAL-SCRIPTS')
    for script_tag in all_scripts:
        sgs_div.insert(0, script_tag)
    content_div_soup.insert_before(sgs_div)
    # Third pass: render them out with the relevant scripts
    for index, (slide_name, slide) in enumerate(slides):
        # Identify the new filename
        slide_filename = '{0}-{1:02d}.html'.format(mod_name, index)
        slide_filepath = os.path.join(os.path.dirname(path), '..', 'lti_html', slide_filename)
        # Add the relevant content back in
        for body_index, (parent, body) in enumerate(slide):
            parent.insert(body_index, body)
        if index != 0:
            potential_exercises = sections.values()[index - 1].keys()
        else:
            potential_exercises = []
        sss_div = soup.new_tag('div', id='SLIDE-SPECIFIC-SCRIPTS')
        for potential_exercise in potential_exercises:
            if potential_exercise in slide_scripts:
                for a_script in slide_scripts[potential_exercise]:
                    sss_div.insert(0, a_script)
            if potential_exercise in ('quicksortCON', 'bubblesortCON'):
                for a_script in slide_scripts[potential_exercise.replace('CON', 'CODE')]:
                    sss_div.insert(0, a_script)
        # Add back in slide specific scripts
        sgs_div.insert_after(sss_div)
        # Write out the file with what we have so far
        with codecs.open(slide_filepath, 'w', 'utf-8') as o:
            o.write(unicode(soup))
        sss_div.decompose()
        for parent, body in slide:
            body.extract()
    if verbose:
        print "\tPhase 3: complete"
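# `triple_up` is used by every break_up_sections variant but never defined in
# these snippets. A plausible sketch, assuming it matches a run of leading
# `../` segments (the variant above slices off exactly three levels; the
# later variants slice off either the matched prefix or a fixed six levels):
import re

triple_up = re.compile(r'^(\.\./)+')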
import codecs
import os

from bs4 import BeautifulSoup

# Relies on the module-level `triple_up` regex sketched above.


def break_up_sections(path, module_data, config):
    print(path)
    book_name = config.book_name
    sections = module_data['sections']
    module_map = config['module_map']
    course_id = config.course_id
    if module_map:
        item_url = config.LMS_url + "/courses/{course_id}/modules/items/{module_item_id}"
        assignment_url = config.LMS_url + "/courses/{course_id}/assignments/{assignment_id}?module_item_id={module_item_id}"
    # Read contents of module HTML file
    try:
        with codecs.open(path, 'r', 'utf-8') as html_file:
            html = html_file.read()
    except IOError:
        print "Error: Could not find HTML file for", path
        return {}
    # Get the module name and create its subfolder
    mod_name = os.path.splitext(os.path.basename(path))[0]
    # Strip out the script, style, link, and meta tags
    soup = BeautifulSoup(html, "html.parser")
    verbose = False
    if verbose:
        print "Found HTML file:", mod_name
    TAGS = [('script', 'src'), ('link', 'href'), ('img', 'src'),
            ('a', 'href'), ('iframe', 'src')]
    # KILL MATHJAX
    # Helpful for debugging, because MathJax takes forever to load
    #for possible_math_jax in soup.find_all('script'):
    #    if possible_math_jax.has_attr('src') and possible_math_jax['src'].startswith('//cdnjs.cloudflare.com/ajax/libs/mathjax'):
    #        possible_math_jax.extract()
    # Find all of the scripts, links, images, etc. that we might need
    for tag_name, tag_url in TAGS + [('div', 'data-frame-src')]:
        for a_tag in soup(tag_name):
            if a_tag.has_attr(tag_url):
                match = triple_up.match(a_tag[tag_url])
                if match:
                    a_tag[tag_url] = '/OpenDSA/' + a_tag[tag_url][len(match.group(0)):]
                elif a_tag[tag_url].startswith('_static/'):
                    a_tag[tag_url] = '/OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
                elif a_tag[tag_url].startswith('_images/'):
                    a_tag[tag_url] = '/OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
    # Skip any sections that don't have points
    # Redirect href urls
    for link in soup.find_all('a'):
        if 'href' not in link.attrs:
            # Really? No href? Is that even valid HTML?
            continue
        href = link['href']
        if href == '#':
            # Skip dummy urls redirecting to itself
            continue
        elif href.startswith('#'):
            # An internal page link; leave it alone
            continue
        elif href.startswith('mailto:'):
            # Email
            continue
        elif href.startswith('http://'):
            # Offsite
            continue
        elif href.startswith('../'):
            # Current directory
            continue
        elif href.endswith('.rst'):
            # The source reference
            continue
        else:
            if '#' in href:
                external, internal = href.split('#', 1)
            else:
                external, internal = href, ''
            if external.endswith('.html'):
                # Snip off the ".html"
                external = external[:-5]
            # Map it to the proper folder in Canvas
            if module_map and external in module_map:
                module_obj = module_map[external]
                if 'assignment_id' in module_map[external] and module_obj.get('assignment_id') is not None:
                    external = assignment_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'),
                        assignment_id=module_obj.get('assignment_id'))
                else:
                    external = item_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'))
            # Force it to approach it from the top
            link['href'] = '#'.join((external, internal))
    # Move header scripts out of header, kill header
    header_tag = soup.find('div', class_='header')
    for bit in reversed(header_tag.contents):
        if bit.name in ('script', 'link'):
            header_tag.next_sibling.insert_before(bit.extract())
    header_tag.extract()
    # Remove unnecessary parts of the HTML
    for class_name in ('topnav', 'bottomnav'):
        element = soup.find('div', class_=class_name)
        if element:
            element.extract()
    element = soup.find('img', alt='nsf')
    if element:
        element.extract()
    filename = mod_name + '.html'
    single_file_path = os.path.join(os.path.dirname(path), '..', 'lti_html', filename)
    with codecs.open(single_file_path, 'w', 'utf-8') as o:
        o.write(unicode(soup))
    return None
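# The break_up_sections variants expect `config` to support both attribute
# access (config.book_name) and item access (config['module_map']). A minimal
# sketch of a compatible object; the field names mirror the attributes used
# above, everything else is an assumption:
class Config(object):
    def __init__(self, book_name, course_id, LMS_url, module_map):
        self.book_name = book_name
        self.course_id = course_id
        self.LMS_url = LMS_url
        self.module_map = module_map

    def __getitem__(self, key):
        # Dict-style lookup falls through to the attributes
        return getattr(self, key)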
import codecs
import os

from bs4 import BeautifulSoup

# Relies on the module-level `triple_up` regex sketched above.


def break_up_sections(path, module_data, config):
    print(path)
    book_name = config.book_name
    sections = module_data['sections']
    module_map = config['module_map']
    course_id = config.course_id
    if module_map:
        item_url = config.LMS_url + "/courses/{course_id}/modules/items/{module_item_id}"
        assignment_url = config.LMS_url + "/courses/{course_id}/assignments/{assignment_id}?module_item_id={module_item_id}"
    # Read contents of module HTML file
    try:
        with codecs.open(path, 'r', 'utf-8') as html_file:
            html = html_file.read()
    except IOError:
        print "Error: Could not find HTML file for", path
        return {}
    # Get the module name and create its subfolder
    mod_name = os.path.splitext(os.path.basename(path))[0]
    # Strip out the script, style, link, and meta tags
    soup = BeautifulSoup(html, "html.parser")
    verbose = False
    if verbose:
        print "Found HTML file:", mod_name
    TAGS = [('script', 'src'), ('link', 'href'), ('img', 'src'), ('a', 'href')]
    # KILL MATHJAX
    # Helpful for debugging, because MathJax takes forever to load
    #for possible_math_jax in soup.find_all('script'):
    #    if possible_math_jax.has_attr('src') and possible_math_jax['src'].startswith('//cdnjs.cloudflare.com/ajax/libs/mathjax'):
    #        possible_math_jax.extract()
    # Find all of the scripts, links, images, etc. that we might need
    for tag_name, tag_url in TAGS + [('div', 'data-frame-src')]:
        for a_tag in soup(tag_name):
            if a_tag.has_attr(tag_url):
                if triple_up.match(a_tag[tag_url]):
                    # Strips a fixed six-level '../' prefix, unlike the
                    # variant above, which slices off whatever matched
                    a_tag[tag_url] = '/OpenDSA/' + a_tag[tag_url][len('../../../../../../'):]
                elif a_tag[tag_url].startswith('_static/'):
                    a_tag[tag_url] = '/OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
                elif a_tag[tag_url].startswith('_images/'):
                    a_tag[tag_url] = '/OpenDSA/Books/' + book_name + '/html/' + a_tag[tag_url]
    # Skip any sections that don't have points
    # Redirect href urls
    for link in soup.find_all('a'):
        if 'href' not in link.attrs:
            # Really? No href? Is that even valid HTML?
            continue
        href = link['href']
        if href == '#':
            # Skip dummy urls redirecting to itself
            continue
        elif href.startswith('#'):
            # An internal page link; leave it alone
            continue
        elif href.startswith('mailto:'):
            # Email
            continue
        elif href.startswith('http://'):
            # Offsite
            continue
        elif href.startswith('../'):
            # Current directory
            continue
        elif href.endswith('.rst'):
            # The source reference
            continue
        else:
            if '#' in href:
                external, internal = href.split('#', 1)
            else:
                external, internal = href, ''
            if external.endswith('.html'):
                # Snip off the ".html"
                external = external[:-5]
            # Map it to the proper folder in Canvas
            if module_map and external in module_map:
                module_obj = module_map[external]
                if 'assignment_id' in module_map[external] and module_obj.get('assignment_id') is not None:
                    external = assignment_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'),
                        assignment_id=module_obj.get('assignment_id'))
                else:
                    external = item_url.format(
                        course_id=course_id,
                        module_item_id=module_obj.get('module_item_id'))
            # Force it to approach it from the top
            link['href'] = '#'.join((external, internal))
    # Move header scripts out of header, kill header
    header_tag = soup.find('div', class_='header')
    for bit in reversed(header_tag.contents):
        if bit.name in ('script', 'link'):
            header_tag.next_sibling.insert_before(bit.extract())
    header_tag.extract()
    # Remove unnecessary parts of the HTML
    for class_name in ('topnav', 'bottomnav'):
        element = soup.find('div', class_=class_name)
        if element:
            element.extract()
    element = soup.find('img', alt='nsf')
    if element:
        element.extract()
    filename = mod_name + '.html'
    single_file_path = os.path.join(os.path.dirname(path), '..', 'lti_html', filename)
    with codecs.open(single_file_path, 'w', 'utf-8') as o:
        o.write(unicode(soup))
    return None
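# A sketch of how break_up_sections might be driven over a book's module HTML
# files; the directory layout and the shape of module_data are assumptions,
# not taken from the snippets above:
import glob
import os


def process_modules(html_dir, book_config):
    for module_path in glob.glob(os.path.join(html_dir, '*.html')):
        module_data = {'sections': {}}  # hypothetical per-module metadata
        break_up_sections(module_path, module_data, book_config)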