def getFromDB(blog_id, comment_id=None, conn=sqlite3.connect(DB_FILE)):
    '''
    Get Comment objects from the database.

    Parameters
    ====================================
    blog_id `int|list[int]` - The blog_id(s) to retrieve.
    comment_id `int` - The comment_id to retrieve.
    conn `sqlite3.Connection` - A SQLite connection object. Defaults to a new connection to the global DB_FILE database file.

    Returns
    ====================================
    `Comment|list(Comment)` - a Comment object; a list of Comments is returned if no comment_id is given.
    '''
    cur = conn.cursor()
    if isinstance(blog_id, list):
        blogClause = "blog_id IN (" + ",".join(str(b) for b in blog_id) + ")"
    else:
        blogClause = "blog_id = " + str(blog_id)
    if comment_id is None:
        cur.execute("SELECT * FROM comments WHERE " + blogClause)
        commentRows = cur.fetchall()
        return [Comment(*c) for c in commentRows]
    else:
        cur.execute("SELECT * FROM comments WHERE " + blogClause + " AND comment_id = " + str(comment_id))
        commentRow = cur.fetchone()
        return Comment(*commentRow)
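# Since getFromDB assembles its WHERE clause by string concatenation, it is open to
# SQL injection if the ids ever come from user input. A minimal sketch of the same
# lookup using sqlite3 parameter placeholders (assuming the same `comments` table and
# the `Comment(*row)` constructor used throughout this module):
def getFromDB_parameterized(blog_id, comment_id=None, conn=None):
    conn = conn or sqlite3.connect(DB_FILE)
    cur = conn.cursor()
    ids = blog_id if isinstance(blog_id, list) else [blog_id]
    placeholders = ",".join("?" for _ in ids)
    sql = "SELECT * FROM comments WHERE blog_id IN (" + placeholders + ")"
    params = list(ids)
    if comment_id is not None:
        cur.execute(sql + " AND comment_id = ?", params + [comment_id])
        row = cur.fetchone()
        return Comment(*row) if row else None
    cur.execute(sql, params)
    return [Comment(*row) for row in cur.fetchall()]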
def latexify(soup):
    for item in tagspec:
        for tag in soup.find_all(name=item[0]):
            if item[1]:
                tag.insert_before(Comment(item[1]))
            if item[2]:
                tag.insert_after(Comment(item[2]))
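# `tagspec` is defined elsewhere; from its use above it is evidently a list of
# (tag_name, before, after) triples whose LaTeX fragments get wrapped in HTML comments
# around every matching tag. Hypothetical entries, for illustration only:
tagspec = [
    ('h2', '\\section{', '}'),
    ('h3', '\\subsection{', '}'),
    ('em', '\\emph{', '}'),
    ('b', '\\textbf{', '}'),
]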
def add_image_map(tag, siz):
    map_tag = tag.find_previous_sibling('map')
    if not map_tag:
        return tag
    # Mostly the scale is half width, as we have two columns.
    scale = 0.5
    # Sometimes that does not fit, so we keep reducing.
    while (siz[0] * scale) > 280:
        scale *= 0.8
    # Some images are in one-column sections, so twice the size.
    if tag.find_parent("div", class_="full-width"):
        print("Found a BIG image")
        scale *= 2
    # The image may need to be on a new page.
    preamble = '\n\n\\par\\Needspace{' + str(
        30 + siz[1] * scale) + 'pt}\\begin{picture}(' + str(
        siz[0] * scale) + ',' + str(siz[1] * scale) + ')\n'
    postamble = ''
    for area in map_tag.find_all(name='area'):
        if area.has_attr('coords') and area.has_attr('href') and area.has_attr('shape'):
            if area['shape'] == 'rect':
                label = label_of_ref(area['href'])
                coords = area['coords'].split(',')
                # Calculations with w and h are because the HTML hotspots have
                # y down the page, and LaTeX ones go up the page.
                # And we have to play games with strings, ints and scaling.
                coords = [int(x) * scale for x in coords]  # convert to numbers
                # print('coord ', coords)
                x, y, x1, y1 = coords
                w = x1 - x
                y = siz[1] * scale - y
                y1 = siz[1] * scale - y1
                h = y - y1
                # Do not include the rather wide hotspots.
                # Each rectangle is 'put' into the picture.
                if w < (520 * scale):
                    x, y1, w, h = [str(k) for k in [x, y1, w, h]]
                    postamble += ' \\put(' + x + ',' + y1 + '){\\hyperref[\\foo{' + label + '}]{\\makebox(' + w + ',' + h + '){}}}\n'
    postamble += '\\end{picture}\n\n'
    tag.insert_before(Comment(preamble))
    tag.insert_after(Comment(postamble))
    # The image itself, using put, is before the puts for the areas.
    tag.insert_before(Comment(' \\put(0,0){\\includegraphics[scale=' + str(scale) + ']{'))
    tag.insert_after(Comment('}}\n'))
    return tag
def patch(filepath): if ("php" in filepath): patch_php(filepath) return 0 try: with open(filepath) as inf: txt = inf.read() # soup = BeautifulSoup(txt, 'html.parser') soup = BeautifulSoup(txt, "html5lib") mydiv = soup.head.find('script', {'class': 'gtm'}) if not mydiv: scrTag = Tag(soup, name='script') scrTag['class'] = "gtm" scrTag.string = headSnippet soup.head.insert(0, Comment('End Google Tag Manager')) soup.head.insert(0, scrTag) soup.head.insert(0, Comment('Google Tag Manager')) #scrTag.insert_before(Comment('Google Tag Manager')) #scrTag.insert_after(Comment('End Google Tag Manager')) # insert body snippet into the document iframeTag = Tag(soup, name='iframe') iframeTag['src'] = iframeSrc iframeTag['height'] = "0" iframeTag['width'] = "0" iframeTag['style'] = "display:none;visibility:hidden" noscrTag = Tag(soup, name='noscript') noscrTag['class'] = 'gtm' noscrTag.insert(0, iframeTag) soup.body.insert(0, Comment('End Google Tag Manager (noscript)')) soup.body.insert(0, noscrTag) soup.body.insert(0, Comment('Google Tag Manager (noscript)')) #noscrTag.insert_before(Comment('Google Tag Manager (noscript)')) #noscrTag.insert_after(Comment('End Google Tag Manager (noscript)')) # save the file again with open(filepath, 'w') as outf: outf.write(str(soup)) except IOError as e: print "I/O error({0}): {1}".format(e.errno, e.strerror) return -1 except: print "Unexpected error:", sys.exc_info()[0] return -2 print "Analytics Patched Successfully" return 0
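# `headSnippet` and `iframeSrc` are module-level constants defined elsewhere in this
# script; they would hold the standard Google Tag Manager snippets, roughly of this
# shape (GTM-XXXXXXX is a placeholder container id and the bootstrap body is elided):
GTM_CONTAINER_ID = "GTM-XXXXXXX"
headSnippet = ("(function(w,d,s,l,i){/* standard GTM bootstrap code */})"
               "(window,document,'script','dataLayer','%s');" % GTM_CONTAINER_ID)
iframeSrc = "https://www.googletagmanager.com/ns.html?id=%s" % GTM_CONTAINER_ID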
def createJSONfile(url, soupObject):
    '''
    Creates a JSON file writing in it the string from the soupObject.
    '''
    my_title = getTitle(soupObject)
    my_time = getTime()

    # Add a comment to the BeautifulSoup object.
    tag = soupObject.html
    new_comment = Comment('\nWebpage title: ' + my_title + ';\n' +
                          'Webpage extracted from: ' + url + ';\n' +
                          'Webpage time extraction: ' + my_time + ';\n\n')
    tag.insert_before(new_comment)

    # Make the soup object readable.
    p_soup_html = soupObject.prettify()

    # Convert into JSON.
    y = json.dumps(p_soup_html)
    with open('%s.json' % my_title, 'w', encoding='utf-8') as file:
        file.write(y)
def getOtherComments(self, retreiveCount=5, blog_ids=None):
    '''
    Get a list of comments not in this blog.

    Parameters
    ====================================
    retreiveCount `int` - The topmost k comments; a small integer below 30 is recommended.
    blog_ids `list[int]` - A pre-fetched blog_id list.

    Returns
    ====================================
    `list(Comment)` - a list of Comment objects.
    '''
    cur = self.conn.cursor()
    if blog_ids is None:
        blog_ids = Blog.getIDs(self.conn)
    blog_ids = random.sample(blog_ids, retreiveCount + 1)
    if self.blog_id in blog_ids:
        del blog_ids[blog_ids.index(self.blog_id)]
    cur.execute("SELECT * FROM comments WHERE blog_id IN (" +
                ",".join([str(bi) for bi in blog_ids]) +
                ") ORDER BY RANDOM() LIMIT " + str(retreiveCount))
    comments = cur.fetchall()
    return [Comment(*c) for c in comments]
def getCommentsFromSimilarBlogs(self, conn2=sqlite3.connect(DB_FILE2), topK=10, retreiveCount=5,
                                orderedBy="random", cachedWordList=None, logKeywords=False,
                                printBlogTitles=False):
    '''
    Get a list of comments from blogs similar to this one (by tf-idf).

    Parameters
    ====================================
    conn2 `sqlite3.Connection` - A SQLite connection object for the word dictionary. Defaults to a new connection to the global DB_FILE2 database file.
    topK `int` - The top-K tf-idf words to be selected for comparisons.
    retreiveCount `None|int` - The topmost k comments, all if None is given.

    Returns
    ====================================
    `list(Comment)` - a list of Comment objects.
    '''
    cur = self.conn.cursor()
    # Select a list of comments from the similar blogs.
    similarBlogs = self.getSimilarBlogs(conn2, topK, retreiveCount, orderedBy=orderedBy,
                                        logKeywords=logKeywords, cachedWordList=cachedWordList)
    if printBlogTitles:
        print([str(b.title) for b in similarBlogs])
    cur.execute("SELECT * FROM comments WHERE blog_id IN (" +
                ",".join([str(b.blog_id) for b in similarBlogs]) +
                ") ORDER BY RANDOM()" +
                ("" if retreiveCount is None else (" LIMIT " + str(retreiveCount))))
    return [Comment(*c) for c in cur.fetchall()]
def response(self, flow: http.HTTPFlow): response = flow.response if CONTENT_TYPE in response.headers: if any( map(lambda t: t in response.headers[CONTENT_TYPE], RELEVANT_CONTENT_TYPES)): # Response is a web page; proceed. insertedScripts: List[str] = [] soup = BeautifulSoup(response.content, HTML_PARSER, from_encoding=inferEncoding(response)) requestURL = flow.request.pretty_url # should work in transparent mode too, unless the Host header is spoofed isApplicable: Callable[[Userscript], bool] = userscript.applicableChecker( requestURL) for script in self.userscripts: if isApplicable(script): useInline = ctx.options.inline or script.downloadURL is None if useInline and len(script.unsafeSequences) > 0: logError(unsafeSequencesMessage(script)) continue logInfo( f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ...""" ) result = inject( script, soup, Options( inline=ctx.options.inline, verbose=ctx.options.verbose, )) if type(result) is BeautifulSoup: soup = result insertedScripts.append(script.name + ( "" if script.version is None else " " + stringifyVersion(script.version))) else: logError( "Injection failed due to the following error:") logError(str(result)) index_DTD: Optional[int] = indexOfDTD(soup) # Insert information comment: if ctx.options.verbose: soup.insert( 0 if index_DTD is None else 1 + index_DTD, Comment(INFO_COMMENT_PREFIX + ("No matching userscripts for this URL." if insertedScripts == [] else "These scripts were inserted:\n" + bulletList(insertedScripts)) + "\n")) # Prevent BS/html.parser from emitting `<!DOCTYPE doctype html>` or similar if "DOCTYPE" is not all uppercase in source HTML: if index_DTD is not None and REGEX_DOCTYPE.match( soup.contents[index_DTD]): # There is a DTD and it is invalid, so replace it. soup.contents[index_DTD] = Doctype( re.sub(REGEX_DOCTYPE, "", soup.contents[index_DTD])) # Serialize and encode: response.content = str(soup).encode( fromOptional(soup.original_encoding, CHARSET_DEFAULT), "replace")
def response(self, flow: http.HTTPFlow): response = flow.response if CONTENT_TYPE in response.headers: if any( map(lambda t: t in response.headers[CONTENT_TYPE], RELEVANT_CONTENT_TYPES)): # Response is a web page; proceed. insertedScripts: List[str] = [] soup = BeautifulSoup(response.content, HTML_PARSER, from_encoding=inferEncoding(response)) requestURL = flow.request.pretty_url # should work in transparent mode too, unless the Host header is spoofed if requestContainsQueryParam( option(T.option_query_param_to_disable), flow.request): logInfo( f"""Not injecting any userscripts into {requestURL} because it contains a `{option(T.option_query_param_to_disable)}` query parameter.""" ) return isApplicable: Callable[[Userscript], bool] = userscript.applicableChecker( requestURL) for script in self.userscripts: if isApplicable(script): useInline = option( T.option_inline) or script.downloadURL is None if useInline and len(script.unsafeSequences) > 0: logError(unsafeSequencesMessage(script)) continue logInfo( f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ...""" ) result = inject( script, soup, Options(inline=option(T.option_inline), )) if type(result) is BeautifulSoup: soup = result insertedScripts.append(script.name + ( "" if script.version is None else " " + T.stringifyVersion(script.version))) else: logError( "Injection failed due to the following error:") logError(str(result)) index_DTD: Optional[int] = indexOfDTD(soup) # Insert information comment: if option(T.option_list_injected): soup.insert( 0 if index_DTD is None else 1 + index_DTD, Comment(HTML_INFO_COMMENT_PREFIX + ("No matching userscripts for this URL." if insertedScripts == [] else "These scripts were inserted:\n" + bulletList(insertedScripts)) + "\n")) # Serialize and encode: response.content = str(soup).encode( fromOptional(soup.original_encoding, CHARSET_DEFAULT), "replace")
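# `indexOfDTD` is imported from the addon's own helpers. A plausible sketch, assuming it
# simply returns the position of the Doctype node among the soup's top-level contents
# (name and behaviour inferred from its use above, not from the real source):
from typing import Optional
from bs4 import BeautifulSoup, Doctype

def index_of_dtd_sketch(soup: BeautifulSoup) -> Optional[int]:
    for i, node in enumerate(soup.contents):
        if isinstance(node, Doctype):
            return i
    return None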
def as_html(self, inline=False):
    div = Tag(name='div')
    if inline:
        div.append(Comment(str(self)))
    else:
        p = Tag(name='p')
        p.append('Location not known more precisely.')
        div.append(p)
    return div
def _tweakHTML(self, soup, manifest, swJS): #TODO: adding a DOCTYPE seems to mess with the finished game's layout, a browser issue, quirks mode?... #prefix with <!DOCTYPE html>... #doctype = Doctype('html') #soup.insert(0, doctype) #tweak head... head = soup.head comment = Comment("This file has been modified by pwap8 (https://github.com/loxodromic/pwap8)") head.insert(0, comment) #add some meta tags for colours, icons, etc... head.append(soup.new_tag('meta', attrs={'name': 'theme-color', 'content': '#cccccc'})) head.append(soup.new_tag('meta', attrs={'name': 'apple-mobile-web-app-capable', 'content': 'yes'})) head.append(soup.new_tag('meta', attrs={'name': 'apple-mobile-web-app-status-bar-style', 'content':'#222222'})) head.append(soup.new_tag('meta', attrs={'name': 'apple-mobile-web-app-title', 'content':soup.title.string})) head.append(soup.new_tag('meta', attrs={'name': 'msapplication-TileImage', 'content':"images/{name}-icon-144.png".format(name=self.projectNameShort)})) head.append(soup.new_tag('meta', attrs={'name': 'msapplication-TileColor', 'content':'#cccccc'})) #favicons... head.append(soup.new_tag('link', attrs={'rel': 'apple-touch-icon', 'href': "images/{name}-icon-167.png.png".format(name=self.projectNameShort)})) if self.faviconStyle == "png": head.append(soup.new_tag('link', attrs={'rel':'icon', 'href':'favicon-32.png', 'type':'image/png'})) elif self.faviconStyle == "ico": head.append(soup.new_tag('link', attrs={'rel':'icon', 'href':'favicon.ico', 'type':'image/x-icon'})) #manifest... if self.bInlineManifest: manifestStr = json.dumps(manifest, indent=4, sort_keys=False) head.append(soup.new_tag('link', attrs={'rel':'manifest', 'href':'data:application/manifest+json,' + manifestStr})) else: head.append(soup.new_tag('link', attrs={'rel':'manifest', 'href':"{name}.manifest".format(name=self.projectNameShort)})) #tweak body... body = soup.body #something for when JavaScrript is off... fallbackContent = soup.new_tag("noscript") fallbackContent.string = "This will much be more fun with JavaScript enabled." body.append(fallbackContent) #service worker... #TODO: can we inline the service worker?... startSW = soup.new_tag("script", attrs={'type':'text/javascript'}) startSW.string = "window.onload = () => { 'use strict'; if ('serviceWorker' in navigator) { navigator.serviceWorker.register('./sw.js');}}" body.append(startSW)
def inject_live_server_script(path):
    with open(path) as fp:
        soup = BeautifulSoup(fp, features='html.parser')
        head = soup.find('head')
        if head is None:
            head_tag = soup.new_tag('head')
            soup.append(head_tag)
            head = soup.find('head')
        live_server_script_tag = soup.new_tag(name='script', attrs={'src': '/liveServer.js'})
        head.append(Comment('injected by live-server'))
        head.append(live_server_script_tag)
        b_soup = soup.encode()
        return b_soup
def inject_live_server_script(path):
    try:
        with open(path) as fp:  # TODO use passed path
            soup = BeautifulSoup(fp, features='html.parser')
            head = soup.find('head')
            if head is None:
                head_tag = soup.new_tag('head')
                soup.append(head_tag)
                head = soup.find('head')
            live_server_script_tag = soup.new_tag(
                name='script', attrs={'src': '/liveServer.js'})
            head.append(Comment('injected by live-server'))
            head.append(live_server_script_tag)
            b_soup = soup.encode()
            return b_soup
    except FileNotFoundError:
        # TODO throw or send 404
        return "noo"
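# A hedged sketch of how the injected bytes might be served during development, using
# only the standard library (handler name, port and the fixed 'index.html' path are
# illustrative, not part of the original project):
from http.server import BaseHTTPRequestHandler, HTTPServer

class LiveServerHandler(BaseHTTPRequestHandler):
    def do_GET(self):
        body = inject_live_server_script('index.html')
        if isinstance(body, str):  # the "noo" fallback from the except branch above
            self.send_error(404)
            return
        self.send_response(200)
        self.send_header('Content-Type', 'text/html; charset=utf-8')
        self.end_headers()
        self.wfile.write(body)

if __name__ == '__main__':
    HTTPServer(('127.0.0.1', 8000), LiveServerHandler).serve_forever()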
def getThisComments(self, retreiveCount=None):
    '''
    Get a list of comments under this blog.

    Parameters
    ====================================
    retreiveCount `None|int` - The topmost k comments, all if None is given.

    Returns
    ====================================
    `list(Comment)` - a list of Comment objects.
    '''
    cur = self.conn.cursor()
    cur.execute("SELECT * FROM comments WHERE blog_id = " + str(self.blog_id) +
                ((" ORDER BY RANDOM() LIMIT " + str(retreiveCount)) if retreiveCount is not None else ""))
    comments = cur.fetchall()
    return [Comment(*c) for c in comments]
def getCommentsFromSimilarTags(self, retreiveCount=5):
    '''
    Get a list of comments from blogs with the same tags as this blog.

    Parameters
    ====================================
    retreiveCount `None|int` - The topmost k comments, all if None is given.

    Returns
    ====================================
    `list(Comment)` - a list of Comment objects.
    '''
    cur = self.conn.cursor()
    # Select a list of comments from blogs with similar tags.
    cur.execute("SELECT * FROM comments WHERE blog_id IN (" +
                ",".join([str(b.blog_id) for b in self.getSimilarBlogsByTags(retreiveCount)]) +
                ") ORDER BY RANDOM()" +
                ("" if retreiveCount is None else (" LIMIT " + str(retreiveCount))))
    return [Comment(*c) for c in cur.fetchall()]
def getCommentsFromSimilarTFIDF(self, conn2=sqlite3.connect(DB_FILE2), topK=10, retreiveCount=5):
    '''
    Get a list of comments from blogs with a similar tf-idf profile to this blog.

    Parameters
    ====================================
    conn2 `sqlite3.Connection` - A SQLite connection object for the word dictionary. Defaults to a new connection to the global DB_FILE2 database file.
    topK `int` - The top-K tf-idf words to be selected for comparisons.
    retreiveCount `None|int` - The topmost k comments, all if None is given.

    Returns
    ====================================
    `list(Comment)` - a list of Comment objects.
    '''
    cur = self.conn.cursor()
    # Select a list of comments from the similar blogs.
    cur.execute("SELECT * FROM comments WHERE blog_id IN (" +
                ",".join([str(b.blog_id) for b in self.getSimilarBlogsByTFIDF(conn2, topK, retreiveCount)]) +
                ") ORDER BY RANDOM()" +
                ("" if retreiveCount is None else (" LIMIT " + str(retreiveCount))))
    return [Comment(*c) for c in cur.fetchall()]
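# Taken together, the retrieval helpers above might be exercised like this. `Blog.getFromDB`
# is a hypothetical loader standing in for however Blog instances are obtained, and the
# connections are illustrative:
conn = sqlite3.connect(DB_FILE)
conn2 = sqlite3.connect(DB_FILE2)
blog = Blog.getFromDB(42, conn=conn)  # hypothetical
own_comments = blog.getThisComments(retreiveCount=10)
tag_comments = blog.getCommentsFromSimilarTags(retreiveCount=5)
tfidf_comments = blog.getCommentsFromSimilarTFIDF(conn2, topK=10, retreiveCount=5)
other_comments = blog.getOtherComments(retreiveCount=5)
print(len(own_comments), len(tag_comments), len(tfidf_comments), len(other_comments))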
from bs4 import BeautifulSoup, Comment
import requests

soup = BeautifulSoup('<b><!--Yo soy un comentario HTML--></b>', 'html.parser')
print(soup.b.string)
print(type(soup.b.string))

comment = Comment(' #mycomment ')
soup.b.string.replace_with(comment)
print(soup.b)
print(type(soup.b.string))
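# The inverse operation, locating comments that are already in a document, appears in
# several snippets below; a minimal self-contained sketch:
from bs4 import BeautifulSoup, Comment

demo = BeautifulSoup('<b><!--one--></b><i><!--two--></i>', 'html.parser')
found = demo.find_all(string=lambda text: isinstance(text, Comment))
print(found)        # ['one', 'two']
for c in found:
    c.extract()     # detach each comment from the tree
print(demo)         # <b></b><i></i>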
for j in range((len(singlewords) - 1) / wordsperquery + 1): words = singlewords[j * wordsperquery:(j + 1) * wordsperquery] if len(words) > 1: searches.append('("' + '" OR "'.join(words) + '")') else: searches.append('"' + words[0] + '"') for words in multwords: searches.append('(' + ' AND '.join(words) + ')') startyear = str(now.year - 1) stopyear = str(now.year + 1) startsearch = int(sys.argv[1]) stopsearch = int(sys.argv[2]) rpp = 30 chunksize = 50 publisher = 'oatd.org' comment = Comment('Kommentar') #check already harvested ejldirs = [ '/afs/desy.de/user/l/library/dok/ejl/onhold', '/afs/desy.de/user/l/library/dok/ejl', '/afs/desy.de/user/l/library/dok/ejl/zu_punkten', '/afs/desy.de/user/l/library/dok/ejl/zu_punkten/enriched', '/afs/desy.de/user/l/library/dok/ejl/backup', '/afs/desy.de/user/l/library/dok/ejl/backup/%i' % (now.year - 1) ] redoki = re.compile('THESES.OATD') renodoi = re.compile('^I\-\-NODOI:(.*)\-\-$') bereitsin = [] for ejldir in ejldirs: print ejldir for datei in os.listdir(ejldir):
def main(qhp_file): """TODO: Docstring for main. :returns: TODO """ src_file = open(qhp_file) dst_file = open( os.path.dirname(src_file.name) + os.path.sep + 'index-fix.qhp', 'w') soup = BeautifulSoup(src_file, 'xml') keywords = soup.findAll('keyword') for keyword in keywords: kid = keyword['id'] m = re.match(r'lav.*::(.*)$', kid) if m: keyword['id'] = m.group(1) pass # DO NOT use soup.prettify # qhelpgenerator CAN NOT recognize the format dst_file.write(str(soup)) # see https://github.com/mmmarcos/doxygen2qtcreator/blob/master/doxygen2qtcreator.py # popup tooltips for function call MUST have the format # <!-- $$$function_name[overload1]$$$ --> # <div class='memdoc'> # <p>Only the first p tag can be show in popup tooltips/hover documentation</p> # <!-- @@@function_name --> # ... # ... # </div> files = soup.find_all('file') common_dir = os.path.dirname(src_file.name) for f in files: html_file = open(common_dir + os.path.sep + f.text, 'rw+') if html_file: html_soup = BeautifulSoup(html_file, 'html.parser') memitems = html_soup.find_all('div', {'class': 'memitem'}) should_write_back_to_file = False if memitems: for item in memitems: memname = item.find('td', {'class': 'memname'}) memdoc = item.find('div', {'class': 'memdoc'}) if memdoc and memname > 0: html_text = memname.get_text() names = html_text.strip(' ').split(' ') # Only handle function call name # ffmpeg av_xxxxx # int function_call_name if len(names) == 2 and names[1].startswith('av'): # TODO:merge multiple <p> tags in memdoc # QtCreator only pick the first <p> tag to display in the tooltips marker_start = u' $$${0}[overload1]$$$ '.format( names[1]) marker_end = u' @@@{0} '.format(names[1]) memdoc.insert_before(Comment(marker_start)) memdoc.insert_after(Comment(marker_end)) should_write_back_to_file = True pass if should_write_back_to_file: print 'insert QtCreator style marker for %s' % html_file.name html_file.seek(0) # DO NOT prettify # for the code in the html, use unicode is more readable html_file.write(unicode(html_soup).encode('utf-8')) html_file.close() pass src_file.close() dst_file.close() print 'Done, /path/to/qhelpgenerator %s -o index.qch' % dst_file.name print 'Then, attach index.qch file to your QtCreator' print 'Tool -> Options -> Help -> Documentation -> Add' pass
def note_error(self, elem, message, strip):
    elem.append(Comment(message))
    if strip:
        elem.parent.clear()
root = os.getcwd() target_path = root+"/images" file = open("index.html","r") webpage = file.read() soup = BeautifulSoup(webpage,'html.parser') get_target_div = soup.find('div',{'id':'lightgallery'}) photo_list = [photos['href'].replace('images/','').lower() for photos in get_target_div.find_all('a')] # print(photo_list) pointer = soup.find('div', {'id': 'lightgallery'}) for dirName, subdirList, fileList in os.walk(target_path, topdown=False): rel_dir = os.path.relpath(dirName, target_path) if rel_dir.startswith('.'): continue comment_tag = Comment(" "+rel_dir.upper()+" ") pointer.append(comment_tag) print('=== %s ===' % comment_tag) for fname in fileList: if fname.startswith('.'): continue if "thumb-" in fname: continue if fname.lower() not in photo_list: new_soup = BeautifulSoup("", "html.parser") new_tag = new_soup.new_tag("a",attrs={'class':"grid-item", 'href':"images/"+rel_dir+"/"+fname}) new_soup.append(new_tag) new_tag = new_soup.new_tag("img", attrs={'src':"images/"+rel_dir+"/thumb-"+fname}) new_soup.a.append(new_tag)
def replacement(cursor, wp_posts, shorten_url_dict, shorten_url_keys, cat_dict, post_tag_dict, imported_idd={}): features = get_features(cursor) feature_ids = [f['term_id'] for f in features] wp_post_lists = [] wp_post_dict = {} h = HTMLParser() for wp_post in wp_posts: # extract wplink and remove all [wplink ...] in content. matches = re.findall(r'(\[wplink name="(.*)"\])', wp_post['post_content']) short_link_dict = {} short_links = [] for i in matches: full, part = i if part in shorten_url_keys: short_links.append(part) if len(short_links) > 0: z = hashlib.md5(''.join(sorted(short_links))).hexdigest() x = {} for short_link in short_links: x[short_link] = [ shorten_url_dict[short_link]['link'], shorten_url_dict[short_link]['title'] ] wp_post['wplink'] = [z, x] # fix newline at <span> & オススメ記事 wp_post['post_content'] = wp_post['post_content'].replace( '\r\n<span', '\r\n\r\n<span') # add more 1 newline add_newline_lists = [ '</h1>', '</h2>', '</h3>', '</h4>', '</h5>', '</table>', '</p>', '</blockquote>', '</ul>', '</ol>' ] for add_newline_list in add_newline_lists: wp_post['post_content'] = wp_post['post_content'].replace( add_newline_list, add_newline_list + "\r\n") # add <br> if needed lists_without_br = [ '<table', '<thead', '<tbody', '<td', '<th', '<tr', '</table>', '</thead>', '</tbody>', '</td>', '</th>', '</tr>', '<p>', '</p>', '</li>' ] ts = wp_post['post_content'].split('\r\n\r\n') for i, v in enumerate(ts): t = ts[i].strip() need_replace = True for lwb in lists_without_br: if t.find(lwb) != -1: need_replace = False break if need_replace: ts[i] = ts[i].replace('\r\n', '<br>\r\n') wp_post['post_content'] = '\r\n\r\n'.join(ts) # remove width & height attribute wp_post['post_content'] = re.sub(r'(.*) width="\d+"(.*)', r'\1\2', wp_post['post_content']) wp_post['post_content'] = re.sub(r'(.*) height="\d+"(.*)', r'\1\2', wp_post['post_content']) # replace [caption] to html format wp_post['post_content'] = re.sub( r'\[caption(.*)\](.*>)(.*)\[\/caption\]', r'<div class="media">\2<div class="caption">\3</div></div>', wp_post['post_content']) # remove [nextpage] #wp_post['post_content'] = re.sub(r'\[\/nextpage\]', '', wp_post['post_content']) #wp_post['post_content'] = re.sub(r'\[nextpage(.*)\]', '', wp_post['post_content']) pid = wp_post['ID'] wp_post_dict[pid] = wp_post wp_post_dict[pid]['meta'] = {} wp_post_dict[pid]['related_article_ids'] = [] wp_post_dict[pid]['related_article_titles'] = [] wp_postmeta_result = get_wp_metadata_by_post_id(cursor, pid) for wp_postmeta in wp_postmeta_result: wp_post_dict[wp_postmeta['post_id']]['meta'][ wp_postmeta['meta_key']] = wp_postmeta['meta_value'] if wp_postmeta['meta_key'] == 'simple_related_posts': # convert related_articles ra_ids = sorted(list( set( map( int, re.findall( r'"(\d+)"', wp_post_dict[wp_postmeta['post_id']]['meta'] ['simple_related_posts'])))), reverse=True) ra_ids = [ ra_id for ra_id in ra_ids if not check_if_fisco(cursor, ra_id) and ra_id in imported_idd ] wp_post_dict[ wp_postmeta['post_id']]['related_article_ids'] = ra_ids # XXX: set default title wp_post_dict[ wp_postmeta['post_id']]['related_article_titles'] = [ 'x' for _ in ra_ids ] del wp_post_dict[wp_postmeta['post_id']]['meta'][ wp_postmeta['meta_key']] for k in wp_post_dict: _wp_post = wp_post_dict[k] # fix html_content. change double newline into <p> tag. sps = _wp_post['post_content'].split('\r\n\r\n') for idx, val in enumerate(sps): if sps[idx][:3] != '<p>': sps[idx] = '<p>{}</p>'.format(val) _wp_post['post_content'] = '\r\n'.join(sps) # insert <br> after some tags. 
_wp_post['post_content'] = re.sub(r'</strong>\r\n', '</strong><br>\r\n', _wp_post['post_content']) _wp_post['post_content'] = re.sub(r'</a>\r\n', '</a><br>\r\n', _wp_post['post_content']) _wp_post['post_content'] = re.sub(r'<p>【(.*)オススメ(.*)】\r\n', r'<p>【\g<1>オススメ\g<2>】<br>\r\n', _wp_post['post_content']) # create soup post_content_soup = BeautifulSoup(_wp_post['post_content'], "lxml") # remove class,id,name and style in html. for tag in post_content_soup(): if isinstance(tag, Tag): for attribute in ["class", "id", "name", "style"]: if tag.name == 'div' and 'class' in tag.attrs and ( 'media' in tag.attrs['class'] or 'caption' in tag.attrs['class']): continue del tag[attribute] # fix html_content. wrap NavigableString into a <p> tag. for k, v in enumerate(post_content_soup.body.findAll(recursive=False)): if isinstance(v, NavigableString): new_p_tag = post_content_soup.new_tag('p') if post_content_soup.body.contents[k].strip() == 'nextpage': new_p_tag.append(Comment('nextpage')) else: new_p_tag.string = unicode(v) post_content_soup.body.contents[k] = new_p_tag post_content_soup.html.unwrap() post_content_soup.body.unwrap() # process <blockquote> for match in post_content_soup.findAll('blockquote'): mf = match.findAll(recursive=False) match.contents = [m for m in match.contents if m != '\n'] for k, v in enumerate(mf): if isinstance(v, Tag) and v.name != 'p' and v.name != 'br': new_p_tag = post_content_soup.new_tag('p') new_p_tag.string = v.text match.contents[k] = new_p_tag if len(mf) == 0: new_p_tag = post_content_soup.new_tag('p') new_p_tag.string = match.text match.string = '' match.insert(0, new_p_tag) # remove span for match in post_content_soup.findAll('span'): match.replaceWithChildren() # remove <a> outside of <img> for match in post_content_soup.findAll('img'): if isinstance(match.parent, Tag) and match.parent.name == 'a': try: if re.match(r'.*\.(jpg|png|gif|bmp)', match.parent['href']).group(): match.parent.unwrap() except: pass #try: # new_br_tag = post_content_soup.new_tag('br') # match.parent.insert(-1, new_br_tag) #except: # pass #if isinstance(match.parent, Tag) and match.parent.name == 'p': # match.parent['style'] = 'text-align: center;' # wrap div outside of table for v in post_content_soup.findAll('table'): new_div_tag = post_content_soup.new_tag('div', **{'class': 'tableWrap'}) contents = v.replace_with(new_div_tag) new_div_tag.append(contents) # wrap div outside of iframe which src is youtube.com/xxx for v in post_content_soup.findAll('iframe'): if v['src'] is not None and v['src'].find('www.youtube.com') != -1: new_div_tag = post_content_soup.new_tag( 'div', **{'class': 'youtube'}) contents = v.replace_with(new_div_tag) new_div_tag.append(contents) # process <!--nextpage--> comments = post_content_soup.find_all( string=lambda text: isinstance(text, Comment)) for comment in comments: if comment == 'nextpage': pp = comment.parent try: ct = 1 pps = pp.find_previous_sibling() while True: if ct > 5: break if len(pps.findChildren('a')) > 0: pps.extract() break else: pps = pps.find_previous_sibling() ct += 1 pp.unwrap() except: pass _wp_post['post_content'] = post_content_soup.prettify(indent_width=2) # cleanup empty tags _wp_post['post_content'] = _wp_post['post_content'].replace( '<p>\n <br/>\n</p>', '') _wp_post['post_content'] = _wp_post['post_content'].replace( '<p>\n</p>', '') # replace <a> tag which values are https://localhost.com/archives/ZZZ reps = [] a_tags = post_content_soup.findAll('a') for a_tag in a_tags: try: matches = 
re.search(r'^https:\/\/localhost.com\/archives', a_tag['href']) if matches is not None: reps.append(a_tag['href']) except: pass # replace absolute link into relative. for rep in reps: r = rep.split('https://localhost.com/archives')[1] _wp_post['post_content'] = _wp_post['post_content'].replace( rep, '/archives' + r) # XXX: fix [nextpage] format error if _wp_post['ID'] in fix_nextpage_dicts.keys(): for tp in fix_nextpage_dicts[_wp_post['ID']]: _wp_post['post_content'] = _wp_post['post_content'].replace( *tp) # unescape html _wp_post['post_content'] = h.unescape(_wp_post['post_content']) # trim html tags _content = post_content_soup.text # validate meta key for k in [ '_aioseop_keywords', '_aioseop_description', '_aioseop_title', 'subtitle' ]: if k not in _wp_post['meta']: _wp_post['meta'][k] = '' _wp_post['post_content'] = _wp_post['post_content'].replace( 'https://localhost.com/wp-content/uploads/', 'https://stg.localhost/640/480/uploads/') _post = { 'id': _wp_post['ID'], 'operator_id': 0, # TODO: 'author_id': _wp_post['post_author'], 'editor_id': 1, 'category_id': 0, 'image_id': 1, 'company_id': 0, 'title': _wp_post['post_title'], 'content': _content, 'lead_content': _content[:140], 'html_content': _wp_post['post_content'], 'sub_title': _wp_post['meta']['subtitle'], 'meta_description': _wp_post['meta']['_aioseop_description'], 'meta_keywords': _wp_post['meta']['_aioseop_keywords'], 'meta_title': _wp_post['meta']['_aioseop_title'], 'noindex_flg': False, 'nofollow_flg': False, 'nolist_flg': False, 'ogp_image_config': 1, 'twitter_card': 2, 'amp_flg': False, 'instant_articles_flg': False, 'status': 1, 'trash_flg': False, 'created_at': _wp_post['post_date'], 'updated_at': _wp_post['post_modified'], 'image_urls': [], 'related_article_ids': _wp_post['related_article_ids'], 'related_article_titles': _wp_post['related_article_titles'], #'image_urls': [img['src'] for img in post_content_soup.findAll('img') if 'src' in img], } for img in post_content_soup.findAll('img'): try: isrc = img['src'] _post['image_urls'].append(isrc) except: pass if 'wplink' in _wp_post: _post['wplink'] = _wp_post['wplink'] if _wp_post['post_status'] == 'publish' or _wp_post[ 'post_status'] == 'future': _post['published_at'] = _wp_post['post_date'] # change to features when import if 'series' in _wp_post['meta'] and _wp_post['meta']['series'] != "": _post['series_id'] = _wp_post['meta']['series'] else: # query => select * from wp_term_relationships where term_taxonomy_id = 774; se = xs(cursor, feature_ids) if se is not None: _post['series_id'] = se['term_taxonomy_id'] else: _post['series_id'] = 0 ctrls = [] try: ctrls = phpserialize.loads(_wp_post['meta']['pr']).values() except: pass _post['is_pr'] = '588' in ctrls _post['is_hide'] = '587' in ctrls if _post['is_hide']: _post['nolist_flg'] = True try: if _wp_post['meta']['_aioseop_noindex'] == 'on': _post['noindex_flg'] = True except: pass try: if _wp_post['meta']['_aioseop_nofollow'] == 'on': _post['nofollow_flg'] = True except: pass ## START add categories relations into post sql = "SELECT * FROM wp_term_relationships where object_id = {}".format( _wp_post['ID']) cursor.execute(sql) wp_term_relationships_result = cursor.fetchall() for wtr in wp_term_relationships_result: if wtr['term_taxonomy_id'] in cat_dict: _post['category_id'] = cat_dict[ wtr['term_taxonomy_id']]['term_id'] break ## END ## START add tags relations into post _post['tag_ids'] = [] is_fisco = False for wtr in wp_term_relationships_result: if wtr['term_taxonomy_id'] in post_tag_dict: # check if article is 
Fisco if post_tag_dict[wtr['term_taxonomy_id']]['term_id'] == 541: is_fisco = True _post['tag_ids'].append( post_tag_dict[wtr['term_taxonomy_id']]['term_id']) _pid = post_tag_dict[wtr['term_taxonomy_id']]['parent'] while _pid != 0: if _pid not in post_tag_dict: break _post['tag_ids'].append(post_tag_dict[_pid]['term_id']) _pid = post_tag_dict[_pid]['parent'] # Don't import Fisco articles if not is_fisco: wp_post_lists.append(_post) return wp_post_lists
def cleanup_soup(soup): # Remove existing comments for comment in soup.findAll(text=lambda text: isinstance(text, Comment)): comment.extract() # Remove the wikimedia TOC (there are 3 tags to remove) for tag in soup.find_all(name='span', id='Contents'): tag = tag.parent tag = tag.next_sibling tag = tag.next_sibling tag.previous_sibling.extract() tag.previous_sibling.extract() tag.extract() # Remove more wikimedia cruft (sidebar, footer) for tag in soup.find_all(): if tag.has_attr('id'): if tag['id'] == 'jump-to-nav': tag.extract() if tag['id'] == 'siteSub': tag.extract() if tag['id'] == 'contentSub': tag.extract() if tag['id'] == "column-one": tag.extract() if tag['id'] == "footer": tag.extract() if tag['id'] == "toc": tag.extract() # ul tag may be bad and need an li. # html is fine without, but the latexified version would # otherwise baulk at a missing \item. for tag in soup.find_all(name='ul'): if not tag.contents[0].name == 'li': print("Bad ul tag fixed") tag.insert(0, Comment("\\item ")) # Our two column mode # Each file is a chapter, starting at h1. # And with the 2-col environment inside it. # So go do that for this html tag = soup.body if tag: # The title is one column tag2 = soup.find('h1') # The empty argument to label_of_ref will give a label for this # source file, at its start. if tag2: tag2.insert_after( Comment('latex \\label{' + label_of_ref('') + '}')) tag2.insert_after(Comment('latex \\begin{multicols}{2}')) else: tag.insert(0, Comment('latex \\begin{multicols}{2}')) tag.insert(0, Comment('latex \\label{' + label_of_ref('') + '}')) tag.insert(-1, Comment('latex \\end{multicols}')) # Most text is two column. Fix up the sections marked as full width. for tag in soup.find_all(name='div', class_="full-width"): tag.insert(0, Comment('\\end{multicols}\n')) tag.insert(-1, Comment('\\begin{multicols}{2}\n')) # anchors become \hyperrefs and \labels # provided they are relative. for tag in soup.find_all(name='a'): if tag.has_attr('href'): if not tag.find(name='img'): if not tag['href'].startswith('http'): label = label_of_ref(tag['href']) #print( "hyperref: ", label ) tag.insert_before( Comment('latex \n\\hyperref[\\foo{' + label + '}]{')) tag.insert_after(Comment('latex }\n')) # divs may provide \labels for tag in soup.find_all(name='div'): if tag.has_attr('id') and not tag.contents: label = label_of_ref(tag['id']) #print( "label: ", label ) #insert label after the heading, if there was one #(this gets more accurate LaTeX hyperlink landings) #otherwise just insert it anyway. tag2 = tag.find_next_sibling(re.compile('^h\d')) if tag2: tag2.insert_after(Comment('latex \n\\label{' + label + '}')) else: print('No title for ' + label) tag.insert_before(Comment('latex \n\\label{' + label + '}')) # (valid) images get treated depending on their size # all our images are screenshots, so we just check sizes in pixels. # - small images are inline, and are already sized (using dpi) for inline use # - large images are 72 dpi, and will be at most one column width. for tag in soup.find_all(name='img'): if tag.has_attr('src'): png_filename = abs_filename_of_relative_link(tag['src']) if os.path.isfile(png_filename): with Image.open(png_filename) as image: siz = image.size if tag.has_attr('usemap'): # no \par needed or used for image map. tag = add_image_map(tag, siz) elif siz[0] > 60 or siz[1] > 30: #Bigger images... 
#print( png_filename ) tag.insert_before( Comment( '\n\\par\\includegraphics[max width=\\linewidth]{' )) tag.insert_after(Comment('}\\par\n')) else: #small inline image #the \texorpdfstring is because these inline images #may occur in section headings, and without \texorpdfstring #that would mess up the conversion to pdf which does not like #images in the labels. tag.insert_before( Comment( '\\texorpdfstring{\\protect\\includegraphics[max width=\\linewidth]{' )) tag.insert_after(Comment('}}{}')) # file name is used by includegraphics, so put it in. tag.insert(0, Comment(png_filename.replace('\\', '/')))
bs.find_all(name, attrs, recursive, string, limit, **kwargs)
# Tag search; CSS selectors can be used as well. Finds and returns a list of the
# matching elements; you can search with regular expressions, pass a list of
# names at once, or pass True to match every tag.

bs.select('p #author')
# Returns a list of Tag objects, BeautifulSoup's representation of HTML elements,
# searched with the flexibility of CSS selectors.

bs.select_one(css_selector)  # find the first tag matching the selector

bs.new_tag("a", href="http://www.example.com")  # create a new tag

bs.stripped_strings  # generator of the document's strings with extra whitespace stripped

bs.original_encoding  # auto-detected encoding

NavigableString('example text')  # constructs content that can then be inserted into a tag

Comment('this comment')  # constructs a comment that can then be inserted into the document
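# A minimal, self-contained illustration of the calls listed above (the HTML fragment
# and selectors are made up for the example):
from bs4 import BeautifulSoup, NavigableString, Comment

bs = BeautifulSoup('<p id="author">Jane</p><p>Other</p>', 'html.parser')
print(bs.find_all('p'))            # every <p> tag
print(bs.select_one('p#author'))   # first tag matching the CSS selector

link = bs.new_tag("a", href="http://www.example.com")
link.append(NavigableString('example text'))   # text content for the new tag
bs.p.append(link)                              # attach it inside the first <p>
bs.p.append(Comment('this comment'))           # insert an HTML comment
print(bs)
print(bs.original_encoding)   # None here; only set when parsing from bytes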
def htmlPreview(data): if "OPENSHIFT_APP_UUID" in os.environ: html_file = './app-root/repo/data/Email Security/template/Cisco_Email_Security_NLG_Template_v1.html' else: html_file = './data/Email Security/template/Cisco_Email_Security_NLG_Template_v1.html' file = codecs.open(html_file, 'r', 'utf-8') soup = BeautifulSoup(file, "html.parser") # Adding Main heading and sub heading main_heading = soup.find(id="main_heading") cisco_img = main_heading.td text_heading = main_heading.td.find_next('td') text_heading.string = data['intromainheading'] text_heading.append(soup.new_tag('br')) text_heading.append(soup.new_string(data['introsubheading'])) #Adding Customer name in email customer_name = soup.find(id='customername') customer_name.td.string.replace_with('\r\n\t\t\t\tHi {},\r\n'.format( data['first_name'])) # Finding introtext in html introtext_1 = soup.find(id='introtext1') # Replacing introtext with data introtext1 = {k: v for k, v in data.items() if k.startswith("introtext1")} i = 0 for key, value in sorted(introtext1.items()): if (i == 0): introtext_1.string = value i = i + 1 else: introtext_1.append(soup.new_tag('br')) introtext_1.append(soup.new_string(value)) # Finding recommendation link in html r_link = soup.find(id='recommendation') r_link['href'] = data['recommendationlink'] # Finding introtext in html introtext_2 = soup.find(id='introtext2') # Replacing Second introtext with data introtext2 = {k: v for k, v in data.items() if k.startswith("introtext2")} i = 0 for key, value in sorted(introtext2.items()): if (i == 0): introtext_2.string = value i = i + 1 else: introtext_2.append(soup.new_tag('br')) introtext_2.append(soup.new_string(value)) # Replacing status icon for feautres sicon = soup.find(id="statusicon") # print(sicon) sicon.img['src'] = data['statusicon'] #Replacing 2nd main heading heading2main = soup.find(id="heading2main") heading2main.string = data['headingtwomain'] heading2sub = soup.find(id="heading2sub") heading2sub.string = data['heading2sub'] #Updating features details features = soup.find(id="features_status") features_text = { k: v for k, v in data.items() if k.startswith("feature") & k.endswith("text") } new_tag = {} j = 1 for i in range(1, len(features_text) + 1): if features_text['feature{}text'.format(i)] != '': new_tag["feature{}".format(j)] = copy.copy(features.tr) new_tag["feature{}".format(j)].td.string = features_text[ 'feature{}text'.format(i)] new_tag["feature{}".format(j)].img['src'] = data[ 'feature{}statusimg'.format(i)] j += 1 features.tbody.clear() # features.decompose() print("length of new_tag", len(new_tag)) if (len(new_tag) == 0): features.decompose() else: for k in range(1, len(new_tag) + 1): features.tbody.append(Comment("LICENSE STATUS " + str(k))) features.tbody.append(new_tag["feature{}".format(k)]) features.tbody.append(Comment("LICENSE STATUS END " + str(k))) # for key,value in new_tag.items(): # print("came here") # features.tbody.append( Comment("LICENSE STATUS FOR " + str(key))) # features.tbody.append(value) # features.tbody.append( Comment("LICENSE STATUS END FOR" + str(key))) #Activation link activation_link = soup.find(id="activationlink") if (data['clicktoactivatelink'] == ''): activation_link.decompose() else: activation_link['href'] = data['clicktoactivatelink'] # Contact details heading3 = soup.find(id="heading3main") heading3.strong.string = data['heading3main'] text3 = soup.find(id="text3") text3.string = data['textbox3'] outtext = soup.find(id="outrotext") outtext.string = data['outrotext'] html_content = soup.prettify() 
return html_content
def buildTFIDF(self): # Create new tf-idf tables cur2 = self.conn2.cursor() print("DB Initiation - Creating tf-idf tables") cur2.execute('''DROP TABLE IF EXISTS blogs_tf_idf''') cur2.execute('''DROP TABLE IF EXISTS blogs_title_tf_idf''') cur2.execute('''DROP TABLE IF EXISTS comments_tf_idf''') self.conn2.commit() cur2.execute('''CREATE TABLE blogs_tf_idf (blog_id INTEGER, word_id INTEGER, count INTEGER, tf_idf FLOAT, PRIMARY KEY(blog_id,word_id), FOREIGN KEY(word_id) REFERENCES word_dict(id))''') self.conn2.commit() cur2.execute('''CREATE TABLE blogs_title_tf_idf (blog_id INTEGER, word_id INTEGER, count INTEGER, tf_idf FLOAT, PRIMARY KEY(blog_id,word_id), FOREIGN KEY(word_id) REFERENCES word_dict(id))''') self.conn2.commit() cur2.execute('''CREATE TABLE comments_tf_idf (blog_id INTEGER, comment_id INTEGER, word_id INTEGER, count INTEGER, tf_idf FLOAT, PRIMARY KEY(blog_id,comment_id,word_id), FOREIGN KEY(word_id) REFERENCES word_dict(id))''') self.conn2.commit() print("DB TFIDF Initialization - Loop Entries") cur = self.conn.cursor() # Select the title and blog ids form all the blogs allEntries = cur.execute("SELECT blog_id,title,body FROM blogs" + ("" if self.rowLimit is None else (" LIMIT " + str(self.rowLimit)))) blogsTFIDF = dict() blogsTitleTFIDF = dict() commentsTFIDF = dict() idx = 0 # Loop all the blogs for tf-idf preparation blogCount = Blog.getCount(self.conn) if self.rowLimit is None else self.rowLimit for i in allEntries: # Segment the title and push into the counter allWordsTitle = self.transformTextToIDs(i[1]) titleCounter = collections.Counter(allWordsTitle) eleLen = sum(titleCounter.values()) # There may be cases with no valid words found if (eleLen > 0): blogsTitleTFIDF[i[0]] = {w[1]: (ctn, ctn/eleLen*w[4]) for w,ctn in titleCounter.items()} # Segment the body and push into the counter allWordsBody = self.transformTextToIDs(i[2]) bodyCounter = collections.Counter(allWordsBody) eleLen = sum(bodyCounter.values()) # There may be cases with no valid words found if (eleLen > 0): blogsTFIDF[i[0]] = {w[1]: (ctn, ctn/eleLen*w[4]) for w,ctn in bodyCounter.items()} # Get the comments and push all the words comments = Comment.getFromDB(i[0]) commentsTFIDF[i[0]] = dict() for c in comments: allWordsComment = self.transformTextToIDs(c.body) commentCounter = collections.Counter(allWordsComment) eleLen = sum(commentCounter.values()) # There may be cases with no valid words found if (eleLen > 0): commentsTFIDF[i[0]][c.comment_id] = {w[1]: (ctn, ctn/eleLen*w[4]) for w,ctn in commentCounter.items()} # Log progresses idx+=1 if (idx % 500 == 0): print("Processing... (", idx/blogCount*100, " %)") # Loop all the data and insert into the db titleTFIDFLen = len(blogsTitleTFIDF) idx = 0 for blog_id,titleWords in blogsTitleTFIDF.items(): for word_id,titleTfidf in titleWords.items(): cur2.execute("INSERT INTO blogs_title_tf_idf VALUES(" + str(blog_id) + ", " + str(word_id) + ", " + str(titleTfidf[0]) + ", " + str(titleTfidf[1]) + ")") # Log progresses idx += 1 if (idx % 500 == 0): print("Processing - Blog Titles ... (", idx/titleTFIDFLen*100, " %)") # Loop all the data and insert into the db blogTFIDFLen = len(blogsTFIDF) idx = 0 for blog_id,blogWords in blogsTFIDF.items(): for word_id,blogTfidf in blogWords.items(): cur2.execute("INSERT INTO blogs_tf_idf VALUES(" + str(blog_id) + ", " + str(word_id) + ", " + str(blogTfidf[0]) + ", " + str(blogTfidf[1]) + ")") # Log progresses idx += 1 if (idx % 500 == 0): print("Processing - Blogs ... 
(", idx/blogTFIDFLen*100, " %)") # Loop all the comments and insert into the db commentTFIDFLen = len(commentsTFIDF) idx = 0 for blog_id,comments in commentsTFIDF.items(): for comment_id,commentWords in comments.items(): for word_id,commentTfidf in commentWords.items(): cur2.execute("INSERT INTO comments_tf_idf VALUES(" + str(blog_id) + ", " + str(comment_id) + ", " + str(word_id) + ", " + str(commentTfidf[0]) + ", " + str(commentTfidf[1]) + ")") # Log progresses idx += 1 if (idx % 500 == 0): print("Processing - Comments ... (", idx/commentTFIDFLen*100, " %)") self.conn2.commit()
from bs4 import BeautifulSoup, Comment

bsinstance = BeautifulSoup(open('ltps_parse.html'), "lxml")

# Add a string and a comment to the tag (can be done separately):
commenttoadd = Comment("Here's the comment my friend")
links = bsinstance.find('link')
links.append('test1')
links.append(commenttoadd)
print links

"""
# Insert a string into the tag (works like append except we can choose the position):
links = bsinstance.find('link')
links.append('test1')  # If I wrote links.clear(), the contents would be deleted, but not the attrs.
links.insert(0, 'test2')
print links
"""
def build(conn = sqlite3.connect(DB_FILE), conn2 = sqlite3.connect(DB_FILE2), rowLimit = None, segType = 2): ''' Build the dictionary of all the Chinese words and English words. Parameters ==================================== conn `sqlite3.Connection` - A SQLite connection object for the data source. Default as the a new connection to the global DB_FILE databse file. conn2 `sqlite3.Connection` - A SQLite connection object for the word dictionary. Default as the a new connection to the global DB_FILE2 databse file. rowLimit `int` - The limit row count of blogs to return. segType `int` - 0: by characters; 1: by characters, but remove english words; 2: by jieba Returns ==================================== `WordDict - A dictionary object for the connection of currently building dictionary. ''' cur = conn.cursor() # Count the number of blogs and collect all the blog ids if (rowLimit is None): cur.execute("SELECT COUNT(blog_id) FROM blogs" + ("" if rowLimit is None else (" LIMIT " + str(rowLimit)))) blogCount = cur.fetchall()[0][0] else: blogCount = rowLimit # Create dictionary table in the new db cur2 = conn2.cursor() print("DB Initiation - Creating dictionary table") cur2.execute('''DROP TABLE IF EXISTS word_dict''') cur2.execute('''DROP TABLE IF EXISTS blogs_tf_idf''') cur2.execute('''DROP TABLE IF EXISTS blogs_title_tf_idf''') cur2.execute('''DROP TABLE IF EXISTS comments_tf_idf''') conn2.commit() cur2.execute('''CREATE TABLE word_dict (word TEXT, id INTEGER, count INTEGER, freq FLOAT, idf FLOAT, PRIMARY KEY(id))''') conn2.commit() wordDict = WordDict(conn, conn2, segType=segType, rowLimit=rowLimit); print("DB Initiation - Loop Entries") # Select the title and blog ids form all the blogs allEntries = cur.execute("SELECT blog_id,title,body FROM blogs" + ("" if rowLimit is None else (" LIMIT " + str(rowLimit)))) wordCount = dict() idx = 0 wordDict.initalCorpusCount() corpusCount = wordDict.corpusCount # Loop all the blogs for dictionary preparation for i in allEntries: # Segment the title and push into the counter allWordsTitle = WordDict.segment(i[1], segType = segType) wordsTitle = set(allWordsTitle) for w in wordsTitle: wordCount[w] = wordCount.setdefault(w, 0) + 1 # Segment the body and push into the counter allWordsBody = WordDict.segment(i[2], segType = segType) wordsBody = set(allWordsBody) for w in wordsBody: wordCount[w] = wordCount.setdefault(w, 0) + 1 # Get the comments and push all the words comments = Comment.getFromDB(i[0]) for c in comments: allWordsComment = WordDict.segment(c.body, segType = segType) wordsComment = set(allWordsComment) for w in wordsComment: wordCount[w] = wordCount.setdefault(w, 0) + 1 # Log progresses idx+=1 if (idx % 500 == 0): print("Processing... (", idx/blogCount*100, " %)") # Loop all the words and insert into the db wordCountLen = len(wordCount); for idx,w in enumerate(wordCount): line = "INSERT INTO word_dict VALUES('" + w.replace("'","''") + "', " + str(idx) + ", " + str(wordCount[w]) + ", " + str(wordCount[w]/corpusCount) + ", " + str(math.log(corpusCount/wordCount[w])) + ")" cur2.execute(line) if (idx % 500 == 0): print("Insertion... (", idx/wordCountLen*100, " %)") conn2.commit() return wordDict
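# The weights written by build() and buildTFIDF() follow the standard tf-idf definition;
# restated as a small helper (names are illustrative, the arithmetic mirrors the code above):
import math

def tf_idf(count_in_doc, doc_length, doc_freq, corpus_size):
    tf = count_in_doc / doc_length            # ctn / eleLen in buildTFIDF
    idf = math.log(corpus_size / doc_freq)    # stored per word in word_dict
    return tf * idf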
""" doxygen2qtcreator.py : This script scans for documented classes inside Doxygen 'html' directory and inserts markers used by Qt Creator to generate the tooltip when hovering over a class or method name. It uses BeautifulSoup4 to parse and modify the html files. """ from __future__ import print_function import os, sys import argparse from bs4 import BeautifulSoup, Comment # Qt Creator markers class_brief_start = lambda classname: Comment(" $$$ %s-brief " % classname) class_brief_end = lambda classname: Comment(" @@@%s " % classname) method_start = lambda methodname, signature: Comment(" $$$%s[overload1]$$$%s " % (methodname, signature)) method_end = lambda methodname: Comment(" @@@%s " % methodname) def insert_class_markers(soup): """ Inserts Qt Creator markers around the class brief paragraph.""" # look for class name in a div like <div class="title">Namespace::MyClass Class Reference</div> title_div = soup.find("div", "title") if not title_div: raise ValueError( "The following div was not found : <div class='title'>...<div>")
'''
text = 'You are viewing information archived from Mozilla.org on %s.' % (
    datetime.utcnow().strftime("%Y-%m-%d"))

# process every file
for filename in locate("*.html", args.directory):
    with open(filename, "r") as f:
        soup = BeautifulSoup(f)
    if len(soup.select('#archived')) == 0:
        print 'Processing %s' % (filename)
        # get rid of the search form
        for s in soup.select('#quick-search'):
            s.replace_with(Comment('search removed'))
        # add styles for the notification block
        style = soup.new_tag('style', type='text/css')
        style.append(css)
        soup.head.append(style)
        # add the notification block
        div = soup.new_tag('div', id='archived')
        div.append(text)
        soup.body.insert(0, div)
        with open(filename, "w") as f:
            f.write(str(soup))