def video_search(request):
    video_response = requests.get(request)
    bs_videos = BeautifulSoup(video_response.text, 'html.parser')
    meta, metaEnd = makeHTMLTags("meta")
    img_meta = meta.copy().setParseAction(
        withAttribute(('property', 'og:image')))
    for img in img_meta.searchString(bs_videos):
        content = img.content
        video_trailer_id = content.split("/")[-2]
        video_trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
            video_trailer_id)
        return video_trailer_url

def has_ssl_disabled(apphostconf_dest: str, exclude: list = None) -> bool:
    """
    Check if SSL is disabled in ``ApplicationHost.config``.

    Search for the access tag in the security section of an
    ``ApplicationHost.config`` source file or package.

    :param apphostconf_dest: Path to an ``ApplicationHost.config``
                             source file or package.
    :param exclude: Paths that contain any string from this list are ignored.
    """
    tk_tag_s, _ = makeXMLTags('security')
    tk_access, _ = makeXMLTags('access')
    tag_no_comm = tk_access.ignore(htmlComment)
    tk_access_none = copy(tag_no_comm)
    tk_access_none.setParseAction(withAttribute(sslFlags='None'))
    result = False
    try:
        sec_tag = lang.check_grammar(tk_tag_s, apphostconf_dest,
                                     LANGUAGE_SPECS, exclude)
        if not sec_tag:
            show_unknown('No files matched',
                         details=dict(code_dest=apphostconf_dest))
            return False
    except FileNotFoundError:
        show_unknown('File does not exist',
                     details=dict(code_dest=apphostconf_dest))
        return False
    access_tags = {}
    none_sslflags = {}
    for code_file, val in sec_tag.items():
        access_tags.update(
            lang.block_contains_grammar(tk_access, code_file, val['lines'],
                                        _get_block))
        none_sslflags.update(
            lang.block_contains_grammar(tk_access_none, code_file,
                                        val['lines'], _get_block))
    if not access_tags or none_sslflags:
        show_open('SSL is disabled',
                  details=dict(
                      matched=access_tags if access_tags else none_sslflags))
        result = True
    else:
        show_close('SSL is enabled',
                   details=dict(file=apphostconf_dest,
                                fingerprint=get_sha256(apphostconf_dest)))
    return result

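# Illustrative sketch, separate from the function above (the sample_config
# string and variable names here are assumptions, not fluidasserts code):
# this shows how the withAttribute(sslFlags='None') parse action singles out
# an <access> tag whose SSL flags are disabled.
from copy import copy
from pyparsing import htmlComment, makeXMLTags, withAttribute

sample_config = '''
<security>
    <access sslFlags="None" />
</security>
'''

access_tag, _ = makeXMLTags('access')
access_none = copy(access_tag.ignore(htmlComment))
access_none.setParseAction(withAttribute(sslFlags='None'))

# searchString yields a match only when the attribute value is exactly "None"
print(bool(access_none.searchString(sample_config)))
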
def is_header_x_powered_by_present(webconf_dest: str,
                                   exclude: list = None) -> bool:
    """
    Search for X-Powered-By headers in a Web.config source file or package.

    :param webconf_dest: Path to a Web.config source file or package.
    :param exclude: Paths that contain any string from this list are ignored.
    """
    tk_tag_s, _ = makeXMLTags('customHeaders')
    tk_add_tag, _ = makeXMLTags('add')
    tk_clear_tag, _ = makeXMLTags('clear')
    tk_remove_tag, _ = makeXMLTags('remove')
    tk_remove_tag.setParseAction(withAttribute(name='X-Powered-By'))
    tk_child_tag = MatchFirst(
        [Suppress(tk_add_tag), Suppress(tk_clear_tag), tk_remove_tag])
    result = False
    try:
        custom_headers = lang.check_grammar(tk_tag_s, webconf_dest,
                                            LANGUAGE_SPECS, exclude)
        if not custom_headers:
            show_unknown('No files matched',
                         details=dict(code_dest=webconf_dest))
            return False
    except FileNotFoundError:
        show_unknown('File does not exist',
                     details=dict(code_dest=webconf_dest))
        return False
    tk_rem = Suppress(tk_tag_s) + OneOrMore(tk_child_tag)
    vulns = {}
    for code_file, val in custom_headers.items():
        vulns.update(
            lang.block_contains_empty_grammar(tk_rem, code_file, val['lines'],
                                              _get_block))
    if vulns:
        show_open('Header "X-Powered-By" is present',
                  details=dict(matched=vulns,
                               total_lines=len(custom_headers)))
        result = True
    else:
        show_close('Header "X-Powered-By" is not present',
                   details=dict(file=webconf_dest,
                                fingerprint=get_sha256(webconf_dest)))
    return result

def has_debug_enabled(webconf_dest: str, exclude: list = None) -> bool:
    """
    Check if the debug flag is enabled in Web.config.

    Search for the debug attribute of the compilation tag in a Web.config
    source file or package.

    :param webconf_dest: Path to a Web.config source file or package.
    :param exclude: Paths that contain any string from this list are ignored.
    """
    tk_tag_s, _ = makeXMLTags('system.web')
    tk_compilation, _ = makeXMLTags('compilation')
    tag_no_comm = tk_compilation.ignore(htmlComment)
    tk_comp_debug = copy(tag_no_comm)
    tk_comp_debug.setParseAction(withAttribute(debug='true'))
    result = False
    try:
        sysweb_tag = lang.check_grammar(tk_tag_s, webconf_dest,
                                        LANGUAGE_SPECS, exclude)
        if not sysweb_tag:
            show_unknown('No files matched',
                         details=dict(code_dest=webconf_dest))
            return False
    except FileNotFoundError:
        show_unknown('File does not exist',
                     details=dict(code_dest=webconf_dest))
        return False
    debug_tags = {}
    for code_file, val in sysweb_tag.items():
        debug_tags.update(
            lang.block_contains_grammar(tk_comp_debug, code_file,
                                        val['lines'], _get_block))
    if debug_tags:
        show_open('Debug is enabled',
                  details=dict(matched=debug_tags,
                               total_lines=len(sysweb_tag)))
        result = True
    else:
        show_close('Debug is disabled',
                   details=dict(file=webconf_dest,
                                fingerprint=get_sha256(webconf_dest)))
    return result

def not_custom_errors(webconf_dest: str, exclude: list = None) -> bool:
    """
    Check if the customErrors flag is set to Off in Web.config.

    CWE-12: ASP.NET Misconfiguration: Missing Custom Error Page

    :param webconf_dest: Path to a Web.config source file or package.
    :param exclude: Paths that contain any string from this list are ignored.
    """
    tk_tag_s, _ = makeXMLTags('system.web')
    tk_custom_errors, _ = makeXMLTags('customErrors')
    tag_no_comm = tk_custom_errors.ignore(htmlComment)
    tk_comp_custom_errors = copy(tag_no_comm)
    tk_comp_custom_errors.setParseAction(withAttribute(mode='Off'))
    result = False
    try:
        sysweb_tag = lang.check_grammar(tk_tag_s, webconf_dest,
                                        LANGUAGE_SPECS, exclude)
        if not sysweb_tag:
            show_unknown('No files matched',
                         details=dict(code_dest=webconf_dest))
            return False
    except FileNotFoundError:
        show_unknown('File does not exist',
                     details=dict(code_dest=webconf_dest))
        return False
    vulns = {}
    for code_file, val in sysweb_tag.items():
        vulns.update(
            lang.block_contains_grammar(tk_comp_custom_errors, code_file,
                                        val['lines'], _get_block))
    if vulns:
        show_open('Custom errors are not enabled',
                  details=dict(matches=vulns,
                               total_lines=len(sysweb_tag)))
        result = True
    else:
        show_close('Custom errors are enabled',
                   details=dict(file=webconf_dest,
                                fingerprint=get_sha256(webconf_dest)))
    return result

import urllib.request

from pyparsing import Group, Regex, Suppress, makeHTMLTags, withAttribute

year = '2014'
conn = urllib.request.urlopen(
    'http://www.boxofficemojo.com/yearly/chart/?yr=' + year + '&p=.htm')
html = conn.read().decode('utf-8', errors='ignore')

"""
looking for this recurring pattern:
    <td valign="top" tdalign="center">00-03</td>
    <td valign="top">.50</td>
    <td valign="top">.50</td>

and want a dict with keys 0, 1, 2, and 3 all with values (.50,.50)
"""

td, tdend = makeHTMLTags("td")
keytd = td.copy().setParseAction(withAttribute(tdalign="center"))
td, tdend, keytd = map(Suppress, (td, tdend, keytd))

realnum = Regex(r'1?\.\d+').setParseAction(lambda t: float(t[0]))
integer = Regex(r'\d{1,3}').setParseAction(lambda t: int(t[0]))
DASH = Suppress('-')

# build up an expression matching the HTML bits above
entryExpr = (keytd + integer("start") + DASH + integer("end") + tdend +
             Group(2 * (td + realnum + tdend))("vals"))

# search the input HTML for matches to the entryExpr expression, and build up lookup dict
lookup = {}
for entry in entryExpr.searchString(html):
    for i in range(entry.start, entry.end + 1):
        lookup[i] = tuple(entry.vals)

import argparse
import jsonpickle
import json
import markdown
import re
from pygments import highlight
from pygments.lexers import guess_lexer
from pygments.formatters import HtmlFormatter
from pyparsing import makeHTMLTags, replaceWith, withAttribute

cod = "../frontend/node_modules/.bin/cod"

spanOpen, spanClose = makeHTMLTags("span")
emptySpans = spanOpen.copy().setParseAction(withAttribute(empty=True))
removeSpans = emptySpans | spanOpen + spanClose
removeSpans.setParseAction(replaceWith(" "))

extensions = ['.less', '.css', '.sass', '.scss']
markup_blocks = {}
formatter = HtmlFormatter(cssclass='source-highlight')


def highlight_source(source):
    if not source:
        return ''
    lexer = guess_lexer(source)
    return highlight(source, lexer, formatter)


def add_markup_block(block):

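# Hedged, self-contained usage sketch for the span-stripping expression
# defined above (sample_html is made up). makeHTMLTags start tags carry an
# "empty" result that is True for self-closed tags, so withAttribute(empty=True)
# matches <span/>, and the replaceWith(" ") action lets transformString
# collapse each matched span (or adjacent <span>...</span> pair) to a space.
from pyparsing import makeHTMLTags, replaceWith, withAttribute

span_open, span_close = makeHTMLTags("span")
empty_spans = span_open.copy().setParseAction(withAttribute(empty=True))
remove_spans = empty_spans | span_open + span_close
remove_spans.setParseAction(replaceWith(" "))

sample_html = 'keep <span/> this <span class="c"></span> text'
print(remove_spans.transformString(sample_html))
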
def data_scrape(master_list_of_links):
    prefix = 'http://www.yelp.com'
    big_list = []
    for i in range(len(master_list_of_links)):
        time_between_big_links = randint(between_big_links_lower_bound,
                                         between_big_links_upper_bound)
        big_link = prefix + master_list_of_links[i]
        print(big_link)
        print("Scrape initiated")
        soup = link_opener(big_link)
        street = soup.find_all("span", itemprop="streetAddress")
        locality = soup.find_all("span", itemprop="addressLocality")
        state = soup.find_all("span", itemprop="addressRegion")
        zip_code = soup.find_all("span", itemprop="postalCode")
        phone = soup.find_all("span", class_="biz-phone")
        suffix = '?start='
        # review_count specifies how many search pages of reviews you will
        # crawl through. This is set to go through at most 320 reviews
        review_count = ['0', '40', '80', '120', '160', '200', '240', '280',
                        '320']
        for j in review_count:
            time_between_review_pages = randint(
                between_review_pages_lower_bound,
                between_review_pages_upper_bound)
            print("processing...")
            new_link = big_link + suffix + j
            soup = link_opener(new_link)
            review_content = soup.find_all("div", class_="review-content")
            if not review_content:
                break
            meta_date = makeHTMLTags('meta')[0]
            meta_date.setParseAction(withAttribute(itemprop="datePublished"))
            meta_rating = makeHTMLTags('meta')[0]
            meta_rating.setParseAction(withAttribute(itemprop="ratingValue"))
            for k in review_content:
                indiv_list = [big_link]
                if not street:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(street[0].text)
                if not locality:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(locality[0].text)
                if not state:
                    indiv_list.append("DC")
                else:
                    indiv_list.append(state[0].text)
                if not zip_code:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(zip_code[0].text)
                if not phone:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(phone[0].text.strip())
                date = next(meta_date.scanString(k))[0]
                indiv_list.append(date.content)
                stars = next(meta_rating.scanString(k))[0]
                indiv_list.append(stars.content)
                indiv_list.append(k.p.text.encode("utf-8"))
                big_list.append(indiv_list)
            time.sleep(time_between_review_pages)
        print("Scrape complete!")
        time.sleep(time_between_big_links)
        print("")
    return big_list

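# Hedged sketch of the meta-tag extraction used in data_scrape, run on a
# made-up review fragment (the HTML below is illustrative, not real Yelp
# markup). pyparsing's scanString converts its argument to a string, which is
# why it can be handed a BeautifulSoup tag directly in the function above.
from pyparsing import makeHTMLTags, withAttribute

sample_review = '''
<div class="review-content">
  <meta itemprop="datePublished" content="2014-07-01">
  <meta itemprop="ratingValue" content="4.0">
  <p>Great food.</p>
</div>
'''

meta_date = makeHTMLTags('meta')[0]
meta_date.setParseAction(withAttribute(itemprop="datePublished"))
date_tokens = next(meta_date.scanString(sample_review))[0]
print(date_tokens.content)  # -> 2014-07-01
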
#
#  withAttribute.py
#  Copyright, 2007 - Paul McGuire
#
#  Simple example of using the withAttribute parse action helper
#  to define a parse action that matches only start tags having
#  particular attribute values
#
import pyparsing as pp

data = """\
<td align=right width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;49.950&nbsp;</font></td>
<td align=left width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;50.950&nbsp;</font></td>
<td align=right width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;51.950&nbsp;</font></td>
"""

td, tdEnd = pp.makeHTMLTags("TD")
font, fontEnd = pp.makeHTMLTags("FONT")
realNum = pp.pyparsing_common.real
NBSP = pp.Literal("&nbsp;")
patt = td + font + NBSP + realNum("value") + NBSP + fontEnd + tdEnd

# always use addParseAction when adding withAttribute as a parse action to a start tag
td.addParseAction(pp.withAttribute(align="right", width="80"))

for s in patt.searchString(data):
    print(s.value)

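# A small follow-on sketch (not part of the original example; it reuses the
# data string defined above): withAttribute also accepts the sentinel
# pp.withAttribute.ANY_VALUE when only the presence of an attribute matters,
# regardless of its value. Here every <TD> with an align attribute matches,
# not just the align="right" width="80" cells.
anyAlignTd, _ = pp.makeHTMLTags("TD")
anyAlignTd.addParseAction(pp.withAttribute(align=pp.withAttribute.ANY_VALUE))
for tok, _, _ in anyAlignTd.scanString(data):
    print(tok.align)
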
def movie_search(keyword, daum_id=None):
    r = requests.get(
        "https://apis.daum.net/contents/movie?apikey={}&q={}&output=json".
        format(settings.DAUM_API_KEY, keyword))
    movie_search = r.json()
    movies_search = []
    num_of_movies = movie_search.get("channel").get("totalCount")
    for num in range(num_of_movies):
        img_url = movie_search.get("channel").get("item")[int(num)].get(
            "thumbnail")[0].get("content")
        # image sizes (S, M, L)
        image_split = img_url.rsplit('/', 5)
        index = 4
        replacement = ['R200x0.q99', 'R500x0.q99', 'R700x0.q99']
        movie_img_url = []
        for nums in range(3):
            image_split[index] = replacement[nums]
            movie_img_url.append('/'.join(image_split))
        title_link = movie_search.get("channel").get("item")[int(num)].get(
            "title")[0].get("link")
        daum_id = re.findall(r'\d+', title_link)
        title_kor = movie_search.get("channel").get("item")[int(num)].get(
            "title")[0].get("content")
        title_eng = movie_search.get("channel").get("item")[int(num)].get(
            "eng_title")[0].get("content")
        created_year = movie_search.get("channel").get("item")[int(num)].get(
            "year")[0].get("content")
        run_time = movie_search.get("channel").get("item")[int(num)].get(
            "open_info")[2].get("content")
        grade = movie_search.get("channel").get("item")[int(num)].get(
            "open_info")[1].get("content")
        synopsis = movie_search.get("channel").get("item")[int(num)].get(
            "story")[0].get("content")
        photo_list = []
        count = 1
        while True:
            try:
                photos = movie_search.get("channel").get("item")[int(
                    num)].get("photo{}".format(count)).get("content")
                photo_list.append(photos)
                count += 1
            except:
                break
        resized_photo_url = []
        for image in photo_list:
            image_split = image.rsplit('/', 5)
            index = 4
            replacement = ['R200x0.q99', 'R500x0.q99', 'R700x0.q99']
            each_movie_photo_url = []
            for nums in range(3):
                image_split[index] = replacement[nums]
                each_movie_photo_url.append('/'.join(image_split))
            resized_photo_url.append(each_movie_photo_url)
        count = 0
        nation_list = []
        while True:
            try:
                nations = movie_search.get("channel").get("item")[int(
                    num)].get("nation")[count].get("content")
                nation_list.append(nations)
                count += 1
            except:
                break
        count = 0
        genre_list = []
        while True:
            try:
                genres = movie_search.get("channel").get("item")[int(
                    num)].get("genre")[count].get("content")
                genre_list.append(genres)
                count += 1
            except:
                break
        director_info = []
        actor_info = []
        try:
            title_link = movie_search.get("channel").get("item")[int(
                num)].get("title")[0].get("link")
            response = requests.get(title_link)
            bs = BeautifulSoup(response.text, "html.parser")
            count = 0
            while True:
                used_link = bs.select("ul.list_join li")[count]
                # role
                actor_role = used_link.select('span.txt_join')[0].text
                if "감독" in actor_role:
                    name_kor = used_link.select('em.emph_point')[0].text
                    name_kor_eng = used_link.select('strong.tit_join')[0].text
                    len_of_name_kor = len(name_kor) + 1
                    # English name
                    name_eng = name_kor_eng[len_of_name_kor:]
                    a_tag = used_link.findAll(
                        'a', attrs={'href': re.compile("/person/")})[0]
                    # person ID
                    actor_id = re.findall(r'\d+', a_tag['href'])
                    img_tag = used_link.select("img")[0]
                    # person photo
                    profile_url = img_tag['src']
                    director_info.append({
                        'daum_id': actor_id,
                        'name_eng': name_eng,
                        'name_kor': name_kor,
                        'profile_url': profile_url
                    })
                    count += 1
                else:
                    name_kor = used_link.select('em.emph_point')[0].text
                    name_kor_eng = used_link.select('strong.tit_join')[0].text
                    len_of_name_kor = len(name_kor) + 1
                    # English name
                    name_eng = name_kor_eng[len_of_name_kor:]
                    a_tag = used_link.findAll(
                        'a', attrs={'href': re.compile("/person/")})[0]
                    # actor ID
                    actor_id = re.findall(r'\d+', a_tag['href'])
                    img_tag = used_link.select("img")[0]
                    # actor photo
                    profile_url = img_tag['src']
                    actor_info.append({
                        'daum_id': actor_id,
                        'name_eng': name_eng,
                        'name_kor': name_kor,
                        'profile_url': profile_url,
                        'character_name': actor_role
                    })
                    count += 1
        except:
            pass
        video_list = []
        count = 0
        while True:
            try:
                videos = movie_search.get("channel").get("item")[int(
                    num)].get("video")[count].get("link")
                if videos:
                    response_videos = requests.get(videos)
                    bs_videos = BeautifulSoup(response_videos.text,
                                              "html.parser")
                    meta, metaEnd = makeHTMLTags("meta")
                    img_meta = meta.copy().setParseAction(
                        withAttribute(('property', 'og:image')))
                    img_ref = img_meta
                    for img in img_ref.searchString(bs_videos):
                        content = img.content
                        video_trailer_id = content.split("/")[-2]
                        video_trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
                            video_trailer_id)
                        video_list.append(video_trailer_url)
                count += 1
            except:
                break
        trailer_link = movie_search.get("channel").get("item")[int(num)].get(
            "trailer")[0].get("link")
        if trailer_link:
            response = requests.get(trailer_link)
            bs = BeautifulSoup(response.text, "html.parser")
            meta, metaEnd = makeHTMLTags("meta")
            img_meta = meta.copy().setParseAction(
                withAttribute(('property', 'og:image')))
            img_ref = img_meta
            for img in img_ref.searchString(bs):
                content = img.content
                trailer_id = content.split("/")[-2]
                trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
                    trailer_id)
        movies_search.append({
            'title_kor': title_kor,
            'title_eng': title_eng,
            'nation_list': nation_list,
            # 'created_year': created_year,
            'img_url': movie_img_url,
            'run_time': run_time,
            'grade': grade,
            'director_info': director_info,
            'actor_info': actor_info,
            'genre_list': genre_list,
            # 'synopsis': synopsis,
            # 'photo_list': photo_list,
            # 'video_list': video_list,
        })
        if daum_id:
            for genres in genre_list:
                try:
                    genre = Genre.objects.create(genre=genres)
                except:
                    genre = Genre.objects.filter(genre=genres)
            try:
                grade = Grade.objects.create(grade=grade)
            except:
                grade = Grade.objects.get(grade=grade)
            for nations in nation_list:
                try:
                    nation = MakingCountry.objects.create(
                        making_country=nations)
                except:
                    pass
                    nation = MakingCountry.objects.filter(
                        making_country=nations)
            movie = Movie.objects.create(
                daum_id=daum_id[0],
                title_kor=title_kor,
                title_eng=title_eng,
                created_year=created_year,
                synopsis=synopsis,
                grade=grade,
                run_time=run_time,
                img_url=movie_img_url,
            )
            for actor in actor_info:
                actors = Actor.objects.get_or_create(
                    daum_id=actor['daum_id'][0],
                    name_eng=actor['name_eng'],
                    name_kor=actor['name_kor'],
                    profile_url=actor['profile_url'])
                movie_actor = MovieActor.objects.get_or_create(
                    movie=movie,
                    actor=actors[0],
                    character_name=actor['character_name'])
            for directors in director_info:
                director = Director.objects.get_or_create(
                    daum_id=directors['daum_id'][0],
                    name_eng=directors['name_eng'],
                    name_kor=directors['name_kor'],
                    profile_url=directors['profile_url'])
            for photo in resized_photo_url:
                try:
                    movie_image = MovieImages.objects.create(
                        movie=movie, url=photo)
                except:
                    pass
            specific_movie = Movie.objects.get(daum_id=daum_id[0])
            for genre in genre_list:
                g, created = Genre.objects.get_or_create(genre=genre)
                specific_movie.genre.add(g)
            for nation in nation_list:
                n, created = MakingCountry.objects.get_or_create(
                    making_country=nation)
                specific_movie.making_country.add(n)
            for director in director_info:
                d, created = Director.objects.get_or_create(
                    daum_id=director['daum_id'][0],
                    name_eng=director['name_eng'],
                    name_kor=director['name_kor'],
                    profile_url=director['profile_url'])
                specific_movie.director.add(d)
    return movies_search

#!/usr/bin/python
from pyparsing import makeHTMLTags, withAttribute, Suppress, Regex, Group
import os.path

"""
looking for this recurring pattern:
    <td valign="top" bgcolor="#FFFFCC">0-3</td>
    <td valign="top">.50</td>
    <td valign="top">.50</td>

and want a dict with keys 0, 1, 2, and 3 all with values (.50,.50)
"""

td, tdend = makeHTMLTags("td")
keytd = td.copy().setParseAction(withAttribute(bgcolor="#FFFFCC"))
td, tdend, keytd = map(Suppress, (td, tdend, keytd))

realnum = Regex(r'1?\.\d+').setParseAction(lambda t: float(t[0]))
integer = Regex(r'\d{1,3}').setParseAction(lambda t: int(t[0]))
DASH = Suppress('-')

# build up an expression matching the HTML bits above
entryExpr = (keytd + integer("start") + DASH + integer("end") + tdend +
             Group(2 * (td + realnum + tdend))("vals"))

# search the input HTML for matches to the entryExpr expression, and build up lookup dict
lookup = {}
html = open(os.path.join(os.path.dirname(__file__), '..', '..',
                         'fide.html')).read()
for entry in entryExpr.searchString(html):
    for i in range(entry.start, entry.end + 1):
        lookup[i] = tuple(entry.vals)

# print the first column of the dictionary to a text file

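# A possible completion of the final comment above (the output file name and
# one-key-per-line format are assumptions): write the first column, i.e. the
# integer keys of the lookup dict, to a text file.
with open('first_column.txt', 'w') as out:
    for key in sorted(lookup):
        out.write('{}\n'.format(key))
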
def lr1():
    import urllib.request
    import urllib.parse
    # import requests
    from pyparsing import makeHTMLTags, SkipTo, withAttribute
    from prettytable import PrettyTable

    print("Parsing https://www.worldcoinindex.com/")
    url = 'https://www.worldcoinindex.com'
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    resp = urllib.request.urlopen(req)
    respData = str(resp.read())
    resp.close()

    tbody_Start, tbody_End = makeHTMLTags('tbody')
    tbody = tbody_Start + SkipTo(tbody_End)("body") + tbody_End
    tbody_string = ""
    for tokens, start, end in tbody.scanString(respData):
        tbody_string = tbody_string + tokens.body
    # print(tbody_string)

    # creating a list for bitcoin names
    btc = []
    # parsing bitcoin names
    h1_Start, h1_End = makeHTMLTags('h1')
    h1_body = h1_Start + SkipTo(h1_End)("body") + h1_End
    bitcoin_name = ""
    for tokens, start, end in h1_body.scanString(tbody_string):
        bitcoin_name = bitcoin_name + "\n" + tokens.body

    # getting rid of <span>
    span_start, span_end = makeHTMLTags("span")
    span_body = span_start + SkipTo(span_start | span_end)("body")
    for tokens, start, end in span_body.scanString(bitcoin_name):
        btc.append(tokens.body)

    # creating a list for bitcoin prices
    prices = []
    # parsing bitcoin prices
    price_start, price_end = makeHTMLTags('td')
    price_td = price_start.setParseAction(
        withAttribute(**{"class": "number pricekoers lastprice"}))
    price_body = price_td + SkipTo(price_start | price_end)("body")
    price_string = ""
    for tokens, start, end in price_body.scanString(respData):
        price_string = price_string + "\n" + tokens.body

    # getting rid of <span>
    span_class = span_start.setParseAction(withAttribute(**{"class": "span"}))
    span_body = span_class + SkipTo(span_class | span_end)("body")
    for tokens, start, end in span_body.scanString(price_string):
        prices.append(tokens.body)
    # print(prices)

    # generating PrettyTable
    t = PrettyTable()
    t.field_names = [" ", "Name", "Recent Price"]
    i = 0
    for x in btc:
        t.add_row([i + 1, x, prices[i]])
        i = i + 1
    t.align["Name"] = "c"
    t.align["Recent Price"] = "c"
    print(t)

    # saving data
    f = open('logs.txt', 'w')
    f.write(str(t))
    f.close()