def _has_attributes(filename: str, tag: str, attrs: dict) -> bool:
    """
    Check ``HTML`` attribute values.

    This method checks whether the tag (``tag``) inside the code file
    (``filename``) has attributes (``attrs``) with the specified values.

    :param filename: Path to the ``HTML`` source.
    :param tag: ``HTML`` tag to search.
    :param attrs: Attributes with values to search.
    :returns: True if attribute set as specified, False otherwise.
    """
    with open(filename, 'r', encoding='latin-1') as handle:
        html_doc = handle.read()

    tag_s, _ = makeHTMLTags(tag)
    tag_expr = tag_s

    result = False

    for expr in tag_expr.searchString(html_doc):
        for attr, value in attrs.items():
            try:
                value.parseString(getattr(expr, attr))
                result = True
            except ParseException:
                result = False
                break
        if result:
            break

    return result
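# Minimal usage sketch for _has_attributes (hypothetical file name and values).
# Note that the values in ``attrs`` are pyparsing expressions rather than plain
# strings: each one is matched against the attribute text found on the start tag.
from pyparsing import CaselessLiteral

has_post_form = _has_attributes("page.html", "form",
                                {"method": CaselessLiteral("post")})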
def getBoldUrls(lines=[], sub=0):
    abstart, abend = pyparsing.makeHTMLTags('B')
    grammar2 = abstart + pyparsing.SkipTo(abend) + abend.suppress()
    for x1, x2, x3 in grammar2.scanString(''.join(lines)):
        print(x1)
        print(x2)
        print(x3)
def get_pmid_from_summary(raw_xml):
    id_start, id_end = pyparsing.makeHTMLTags("Id")
    id_pattern = id_start.suppress() + pyparsing.Word(pyparsing.nums, min=1)("pmid") + id_end.suppress()
    try:
        pmids = id_pattern.searchString(raw_xml).asList()[0]
    except IndexError:
        pmids = []
    return pmids
def _extract_meta_options(self):
    """Fill options dictionary with metatags of template."""
    meta_start, meta_end = makeHTMLTags("meta")
    for token, start, end in meta_start.scanString(self.template):
        if ":" in token.name:
            value = token.content
            if token.name.startswith('if:'):
                value = bool(int(value))
            self.options[token.name] = value
def _extract_meta_options(self):
    """Fill options dictionary with metatags of template."""
    meta_start, meta_end = makeHTMLTags("meta")
    for token, start, end in meta_start.scanString(self.template):
        if ":" in token.name:
            value = token.content
            if token.name.startswith('if:'):
                value = bool(int(value))
            key = token.name.replace('if:', '')
            key = ''.join(word.capitalize() for word in re.split(r'\s+', key))
            self.options[token.name] = value
def getUrls(lines=[]):
    grammar = ''
    astart, aend = pyparsing.makeHTMLTags('a')
    grammar = astart + pyparsing.SkipTo(aend) + aend.suppress()
    urls = []
    for x1, x2, x3 in grammar.scanString(''.join(lines)):
        urls.append(str(x1[1][1]))
    return urls
def video_search(request):
    video_response = requests.get(request)
    bs_videos = BeautifulSoup(video_response.text, 'html.parser')
    meta, metaEnd = makeHTMLTags("meta")
    img_meta = meta.copy().setParseAction(
        withAttribute(('property', 'og:image')))
    for img in img_meta.searchString(bs_videos):
        content = img.content
        video_trailer_id = content.split("/")[-2]
        video_trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
            video_trailer_id)
        return video_trailer_url
def pyparsing():
    from pyparsing import makeHTMLTags, SkipTo, htmlComment
    import urllib

    serverListPage = urllib.urlopen("http://agander.home/")
    htmlText = serverListPage.read()
    serverListPage.close()

    aStart, aEnd = makeHTMLTags("A")
    link = aStart + SkipTo(aEnd).setResultsName("link") + aEnd
    link.ignore(htmlComment)

    for toks, start, end in link.scanString(htmlText):
        print('{} -> {}'.format(toks.link, toks.startA.href))
def getUrls(lines=[], sub=0):
    grammar = ''
    astart, aend = pyparsing.makeHTMLTags('a')
    grammar = astart + pyparsing.SkipTo(aend) + aend.suppress()
    urls = []
    for x1, x2, x3 in grammar.scanString(''.join(lines)):
        if sub:
            if len(x1) == 5:
                print(x1[4])
            if len(x1) == 6:
                urls.append(str(x1[5]))
        else:
            urls.append(str(x1[1][1]))
    return urls
def getEmailUrls(lines=[]):
    grammar = ''
    astart, aend = pyparsing.makeHTMLTags('a')
    grammar = astart + pyparsing.SkipTo(aend) + aend.suppress()
    urls = []
    for x1, x2, x3 in grammar.scanString(''.join(lines)):
        if len(x1) == 5:
            urls.append(x1[4])
    for eachUrls in urls:
        if eachUrls.find('alt="Search ') > 0:
            data = eachUrls[eachUrls.find('alt="Search ') + len('alt="Search '):len(eachUrls) - 2]
            print(data)
    return urls
def get_linked_articles(self, wikipage):
    # Define the pyparsing grammar for a URL, that is:
    #    URLlink ::= <a href= URL>linkText</a>
    #    URL ::= doubleQuotedString | alphanumericWordPath
    # Note that whitespace may appear just about anywhere in the link. Note also
    # that it is not necessary to explicitly show this in the pyparsing grammar; by default,
    # pyparsing skips over whitespace between tokens.
    linkOpenTag, linkCloseTag = makeHTMLTags("a")
    link = linkOpenTag + SkipTo(linkCloseTag).setResultsName("body") + linkCloseTag.suppress()

    # Go get some HTML with some links in it.
    # serverListPage = urllib.urlopen( "http://de.wikipedia.org/w/index.php?title=Hauptseite&redirect=no" )
    # htmlText = serverListPage.read()
    # serverListPage.close()
    #
    # print htmlText
    #
    # scanString is a generator that loops through the input htmlText, and for each
    # match yields the tokens and start and end locations (for this application, we are
    # not interested in the start and end values).
    articles = set()
    for toks, strt, end in link.scanString(wikipage):
        if (len(toks.startA.href) != 0 and                    # remove empty links
                toks.startA.href.find('#') == -1 and          # remove anchors
                toks.startA.href.find(':') == -1 and          # remove wikipedia special links
                toks.startA.href.find('?') == -1 and          # remove wikipedia special links
                toks.startA.href.find('Hauptseite') == -1):   # remove link to main page
            if toks.body == "Artikel":
                # save real(!) article name, so we don't get confused by redirects
                articlename = toks.startA.href.lstrip('/wiki/')
            else:
                articles.add(toks.startA.href.lstrip('/wiki/'))
            # print toks.startA.href,"->",toks.body
    return (articles, articlename)
def parse(db, url):
    global add
    global urls

    try:
        if not re.search('^http://', url):
            url = siteurl + "/" + url
            url = "http://" + url.replace("//", "/")
        request = urllib.request.Request(url)
        request.add_header('User-Agent', 'Flowgen/1.0 (http://floft.net)')
        page = urllib.request.urlopen(request)
        html = page.read().decode("utf-8")
        page.close()

        print("Notice: processing {}".format(url))

        # get urls
        linkOpenTag, linkCloseTag = makeHTMLTags("a")
        link = linkOpenTag + SkipTo(linkCloseTag).setResultsName("body") + linkCloseTag.suppress()

        for toks, strt, end in link.scanString(html):
            newurl = toks.startA.href
            if newurl not in urls and newurl not in visited:
                if re.search('^(/|http://' + siteurl + ')', newurl) and not \
                        re.search('(jpg|png|flac|mp3|zip|pdf)$', newurl):
                    urls.append(newurl)

        # get title
        try:
            title = re.search('<title>([^<]*)</title>', html).groups()[0]
        except:
            title = "Untitled"

        # get text
        xml = lxml.html.document_fromstring(html.replace(">", "> ").replace("<", " <"))
        text = xml.cssselect('body')[0].text_content().replace("\n", " ").strip()

        # add to database
        add.append([time(), title, url, text])
    except:
        print("Error: {} does not load".format(url))
def getUrls(lines=[], sub=0):
    grammar = ''
    astart, aend = pyparsing.makeHTMLTags('a')
    grammar = astart + pyparsing.SkipTo(aend) + aend.suppress()
##    grammar2 = abstart + pyparsing.SkipTo(abend) + abend.suppress()
##    for x1, x2, x3 in grammar.scanString(''.join(lines)):
##        print x1
##        print x2
##        print x3
    urls = []
    for x1, x2, x3 in grammar.scanString(''.join(lines)):
        if sub:
            if len(x1) == 6:
                mailreadstatus = 1 if str(x1[1]).find('db_chkstatus=1') > 0 else 0
                mailsubject = str(x1[5])
                maillink = str(x1[1][1])
                nxtSender = True
            if len(x1) == 5 and nxtSender:
                mailsender = ''
                if 'title' in dict(x1):
                    mailsender = str(x1['title']).replace('Search ', '')
                nxtTime = True
                nxtSender = False
            if len(x1) == 4 and nxtTime:
                mailtime = x1[3]
                urls.append([mailsubject, mailreadstatus, mailsender, mailtime, maillink])
                nxtSender = False
                nxtTime = False
                mailreadstatus = 0
                mailsubject = ''
                mailsender = ''
                mailtime = ''
                maillink = ''
        else:
            urls.append(str(x1[1][1]))
            nxtSender = False
            nxtTime = False
    return urls
def initGrammar(self):
    L_Equals = Word("=")
    N_comment = htmlComment()
    N_name = CharsNotIn("{}|[]")
    N_simpleText = SkipTo(
        oneOf(["{{", "|", "[[", "]]", "}}", "'''", "<ref"]))
    N_elements = Forward()
    N_apostrofs = QuotedString("'''").setParseAction(
        lambda s, l, t: {'APOSTROFS': t})
    N_link = nestedExpr(
        opener="[[",
        closer="]]",
        content=N_name + Optional("|" + delimitedList(CharsNotIn("[]"), delim="|"))
    ).setParseAction(self.genLink)
    N_header = Group(L_Equals + SkipTo("=") + L_Equals).setParseAction(
        lambda s, l, t: {'HEADER': t})
    N_template = Forward()
    N_key = CharsNotIn("{}|=")
    # N_value = ZeroOrMore(CharsNotIn("{}|")) + ZeroOrMore(N_template + ZeroOrMore(CharsNotIn("{}|"))).setResultsName('VALUE')
    N_keyValues = "|" + delimitedList(
        Group(Optional(N_key) + Optional("=" + N_elements)), delim="|")
    N_label_content = N_template | ("{{" + OneOrMore("!") + "}}") | CharsNotIn("{}|")
    N_label = nestedExpr(opener="{", closer="}", content=N_label_content)
    N_template << nestedExpr(
        opener="{{", closer="}}",
        content=N_name + Optional(N_keyValues)).setParseAction(self.genTemplate)
    ref_start, ref_end = makeHTMLTags("ref")
    N_named_ref = ref_start + SkipTo(ref_end) + ref_end
    N_named_ref.setParseAction(lambda s, l, t: {'REF': t})
    N_element = (N_comment | N_simpleText | N_named_ref | N_apostrofs
                 | N_link | N_header | N_template | N_label)
    # N_ref = nestedExpr( opener="<ref>", closer="</ref>", content=N_elements).setParseAction( lambda s,l,t: {'REF' : t} )
    N_elements << ZeroOrMore(N_element)

    self.N_S = N_elements
def getUseriCalZip(todaysDate, userName, samlResponse, br, logOut):
    samlResponseText = samlResponse.read()
    theStart, theEnd = makeHTMLTags("textarea")
    search = theStart + SkipTo(theEnd)("body") + theEnd
    saml_resp_str = search.searchString(samlResponseText)[0].body
    relay_state_str = search.searchString(samlResponseText)[1].body
    fileNametoSave = todaysDate + "/" + userName + ".zip"

    br.select_form(name="acsForm")
    br["SAMLResponse"] = saml_resp_str
    br["RelayState"] = relay_state_str
    try:
        br.submit()
    except:
        print "WARN: trouble downloading cal data for " + userName + ": " + " at " + time.strftime('%H:%M') + ".\n"
        logOut.flush()
        time.sleep(60)
        try:
            br.submit()
        except:
            print "FAIL: can't open cal session for " + userName + ": " + " at " + time.strftime('%H:%M') + ".\n"
            logOut.flush()
        else:
            print "OKAY - second try - retrieving cal data for user " + userName + " at " + time.strftime('%H:%M') + ".\n"
            logOut.flush()
            try:
                br.retrieve('https://www.google.com/calendar/exporticalzip', fileNametoSave)
            except:
                print "FAIL: can't open cal session for " + userName + ": " + " at " + time.strftime('%H:%M') + ".\n"
                logOut.flush()
    else:
        print "Retrieving cal data for user " + userName + " at " + time.strftime('%H:%M') + ".\n"
        logOut.flush()
        try:
            br.retrieve('https://www.google.com/calendar/exporticalzip', fileNametoSave)
        except:
            print "FAIL: can't download cal data for " + userName + ": " + " at " + time.strftime('%H:%M') + ".\n"
            logOut.flush()
def lr1():
    import urllib.request
    import urllib.parse
    # import requests
    from pyparsing import makeHTMLTags, SkipTo, withAttribute
    from prettytable import PrettyTable

    print("Parsing https://www.worldcoinindex.com/")
    url = 'https://www.worldcoinindex.com'
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    resp = urllib.request.urlopen(req)
    respData = str(resp.read())
    resp.close()

    tbody_Start, tbody_End = makeHTMLTags('tbody')
    tbody = tbody_Start + SkipTo(tbody_End)("body") + tbody_End
    tbody_string = ""
    for tokens, start, end in tbody.scanString(respData):
        tbody_string = tbody_string + tokens.body
    # print(tbody_string)

    # creating a list for bitcoin names
    btc = []
    # parsing bitcoin names
    h1_Start, h1_End = makeHTMLTags('h1')
    h1_body = h1_Start + SkipTo(h1_End)("body") + h1_End
    bitcoin_name = ""
    for tokens, start, end in h1_body.scanString(tbody_string):
        bitcoin_name = bitcoin_name + "\n" + tokens.body
    # getting rid of <span>
    span_start, span_end = makeHTMLTags("span")
    span_body = span_start + SkipTo(span_start | span_end)("body")
    for tokens, start, end in span_body.scanString(bitcoin_name):
        btc.append(tokens.body)

    # creating a list for bitcoin prices
    prices = []
    # parsing bitcoin prices
    price_start, price_end = makeHTMLTags('td')
    price_td = price_start.setParseAction(
        withAttribute(**{"class": "number pricekoers lastprice"}))
    price_body = price_td + SkipTo(price_start | price_end)("body")
    price_string = ""
    for tokens, start, end in price_body.scanString(respData):
        price_string = price_string + "\n" + tokens.body
    # getting rid of <span>
    span_class = span_start.setParseAction(withAttribute(**{"class": "span"}))
    span_body = span_class + SkipTo(span_class | span_end)("body")
    for tokens, start, end in span_body.scanString(price_string):
        prices.append(tokens.body)
    # print(prices)

    # generating PrettyTable
    t = PrettyTable()
    t.field_names = [" ", "Name", "Recent Price"]
    i = 0
    for x in btc:
        t.add_row([i + 1, x, prices[i]])
        i = i + 1
    t.align["Name"] = "c"
    t.align["Recent Price"] = "c"
    print(t)

    # saving data
    f = open('logs.txt', 'w')
    f.writelines(str(t))
    f.close()
#!/usr/bin/env python
# (c) 2016 John Strickler
#
import requests
from pprint import pprint
import pyparsing as pp

response = requests.get('http://www.python.org')
html = response.text  # decoded text, so pyparsing scans a str rather than bytes

link_start, link_end = pp.makeHTMLTags('a')
link_content = pp.SkipTo(link_end).setResultsName('link')
full_link = link_start + link_content + link_end.suppress()

count = 1
for token, _, _ in full_link.scanString(html):
    if token.href.lower().startswith('http'):
        print("LINK {}:".format(count))
        print("{} ==> {}".format(token.link, token.href))
        print('-' * 60)
        count += 1
from pyparsing import makeHTMLTags, SkipTo, htmlComment
import urllib.request, urllib.parse, urllib.error

serverListPage = urllib.request.urlopen("http://www.yahoo.com")
htmlText = serverListPage.read().decode("utf-8")  # decode (assuming UTF-8) so pyparsing scans a str
serverListPage.close()

aStart, aEnd = makeHTMLTags("A")
link = aStart + SkipTo(aEnd).setResultsName("link") + aEnd
link.ignore(htmlComment)

for toks, start, end in link.scanString(htmlText):
    print(toks.link, "->", toks.startA.href)
# http://pyparsing.wikispaces.com/HowToUsePyparsing
from pyparsing import Word, alphas, alphanums, nums, Literal, restOfLine, OneOrMore, \
    empty, Suppress, replaceWith, Group, Optional
from pyparsing import makeHTMLTags, withAttribute, SkipTo
from pyparsing import Suppress

# Dirty, dirty hack. Move to proper XML soonest
# td_start, td_end = makeHTMLTags("td")
pm_start, pm_end = makeHTMLTags("Placemark")
name_start, name_end = makeHTMLTags("name")
desc_start, desc_end = makeHTMLTags("description")
poly_start, poly_end = makeHTMLTags("Polygon")
ed_start, ed_end = makeHTMLTags("ExtendedData")

placemark_set = (
    pm_start
    # + SkipTo(name_start)
    + name_start + SkipTo(name_end)("name") + name_end
    # Note: this might be a CDATA Section
    + Optional(SkipTo(desc_start) + desc_start + SkipTo(desc_end)("description") + desc_end)
    # + SkipTo(desc_start) + desc_start + "<![CDATA[" + SkipTo("]]>")("description") + ']]>' + desc_end
    + SkipTo(poly_start) + poly_start + SkipTo(poly_end)("polygon") + poly_end
    + Optional(SkipTo(ed_start) + Group(SkipTo(ed_end) + ed_end)("ext_data"))
    + SkipTo(pm_end) + pm_end
)


def gen_record(text):
    for data, startloc, endloc in placemark_set.scanString(text):
        yield data
        # print data.pid[1], ': ' , data.coords.split()[0].rsplit(',', 1)[0]
def trans_tag(self, ltext, tag, fun):
    aopen, aclose = pyparsing.makeHTMLTags(tag)
    a = aopen + pyparsing.SkipTo(aclose).setResultsName("body") + aclose
    a.setParseAction(fun)
    ltext = a.transformString(ltext)
    return ltext
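# Minimal usage sketch for trans_tag; since `self` is unused in the body, None is
# passed here just for illustration. The parse action receives the matched tokens,
# and whatever it returns replaces the whole <b>...</b> span in the output string.
shouted = trans_tag(None, "before <b>quiet</b> after", "b",
                    lambda tokens: tokens.body.upper())
# shouted == "before QUIET after"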
#
# htmlStripper.py
#
# Sample code for stripping HTML markup tags and scripts from
# HTML source files.
#
# Copyright (c) 2006, 2016, Paul McGuire
#
from contextlib import closing
import urllib.request, urllib.parse, urllib.error
from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity,
                       htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)

scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary"
with closing(urllib.request.urlopen(targetURL)) as targetPage:
    targetHTML = targetPage.read().decode("UTF-8")

# first pass, strip out tags and translate entities
firstPass = (htmlComment | scriptBody | commonHTMLEntity |
             anyOpenTag | anyCloseTag).suppress().transformString(targetHTML)

# first pass leaves many blank lines, collapse these down
repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
repeatedNewlines.setParseAction(replaceWith("\n\n"))
secondPass = repeatedNewlines.transformString(firstPass)
#
# withAttribute.py
# Copyright, 2007 - Paul McGuire
#
# Simple example of using withAttribute parse action helper
# to define
#
import pyparsing as pp

data = """\
    <td align=right width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;49.950&nbsp;</font></td>
    <td align=left width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;50.950&nbsp;</font></td>
    <td align=right width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;51.950&nbsp;</font></td>
"""

td, tdEnd = pp.makeHTMLTags("TD")
font, fontEnd = pp.makeHTMLTags("FONT")
realNum = pp.pyparsing_common.real
NBSP = pp.Literal("&nbsp;")
patt = td + font + NBSP + realNum("value") + NBSP + fontEnd + tdEnd

# always use addParseAction when adding withAttribute as a parse action to a start tag
td.addParseAction(pp.withAttribute(align="right", width="80"))

for s in patt.searchString(data):
    print(s.value)
#
# Copyright (c) 2006, 2016, Paul McGuire
#
from urllib.request import urlopen
from pyparsing import (
    makeHTMLTags,
    commonHTMLEntity,
    replaceHTMLEntity,
    htmlComment,
    anyOpenTag,
    anyCloseTag,
    LineEnd,
    replaceWith,
)

scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
with urlopen(targetURL) as targetPage:
    targetHTML = targetPage.read().decode("UTF-8")

# first pass, strip out tags and translate entities
firstPass = ((htmlComment | scriptBody | commonHTMLEntity | anyOpenTag | anyCloseTag)
             .suppress()
             .transformString(targetHTML))

# first pass leaves many blank lines, collapse these down
repeatedNewlines = LineEnd() * (2,)
repeatedNewlines.setParseAction(replaceWith("\n\n"))
import cgi
from pyparsing import makeHTMLTags, SkipTo

raw = """<body><div class="shoveler" id="purchaseShvl">
<p>Customers who bought this item also bought</p>
<div class="foo">
    <span class="bar">Shovel cozy</span>
    <span class="bar">Shovel rack</span>
</div>
</div></body>"""


def foo(parseResult):
    parts = []
    for token in parseResult:
        st = '<div id="%s" class="%s">' % \
            (cgi.escape(getattr(token, 'id')), cgi.escape(getattr(token, 'class')))
        parts.append(st + token.body + token.endDiv)
    return '\n'.join(parts)


start, end = makeHTMLTags('div')
anchor = start + SkipTo(end).setResultsName('body') + end
res = anchor.searchString(raw)
print foo(res)
from pyparsing import makeHTMLTags, withAttribute, Suppress, Regex, Group
import urllib

year = '2014'
conn = urllib.urlopen('http://www.boxofficemojo.com/yearly/chart/?yr=' + year + '&p=.htm')
html = conn.read()

"""
looking for this recurring pattern:
    <td valign="top" tdalign="center">00-03</td>
    <td valign="top">.50</td>
    <td valign="top">.50</td>

and want a dict with keys 0, 1, 2, and 3 all with values (.50,.50)
"""

td, tdend = makeHTMLTags("td")
keytd = td.copy().setParseAction(withAttribute(tdalign="center"))
td, tdend, keytd = map(Suppress, (td, tdend, keytd))

realnum = Regex(r'1?\.\d+').setParseAction(lambda t: float(t[0]))
integer = Regex(r'\d{1,3}').setParseAction(lambda t: int(t[0]))
DASH = Suppress('-')

# build up an expression matching the HTML bits above:
# the key td holds a range like "00-03", the next two tds hold the values
entryExpr = (keytd + integer("start") + DASH + integer("end") + tdend +
             Group(2 * (td + realnum + tdend))("vals"))

# search the input HTML for matches to the entryExpr expression, and build up lookup dict
lookup = {}
for entry in entryExpr.searchString(html):
    for i in range(entry.start, entry.end + 1):
        lookup[i] = tuple(entry.vals)
import urllib
from pyparsing import makeHTMLTags, SkipTo

# read HTML from a web page
serverListPage = urllib.urlopen("http://www.yahoo.com")
htmlText = serverListPage.read()
serverListPage.close()

# using makeHTMLTags to define opening and closing tags
anchorStart, anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens, start, end in anchor.scanString(htmlText):
    print tokens.body, '->', tokens.href
def movie_search(keyword, daum_id=None):
    r = requests.get(
        "https://apis.daum.net/contents/movie?apikey={}&q={}&output=json".format(
            settings.DAUM_API_KEY, keyword))
    movie_search = r.json()
    movies_search = []
    num_of_movies = movie_search.get("channel").get("totalCount")
    for num in range(num_of_movies):
        img_url = movie_search.get("channel").get("item")[int(num)].get(
            "thumbnail")[0].get("content")
        # image sizes (S M L)
        image_split = img_url.rsplit('/', 5)
        index = 4
        replacement = ['R200x0.q99', 'R500x0.q99', 'R700x0.q99']
        movie_img_url = []
        for nums in range(3):
            image_split[index] = replacement[nums]
            movie_img_url.append('/'.join(image_split))
        title_link = movie_search.get("channel").get("item")[int(num)].get(
            "title")[0].get("link")
        daum_id = re.findall(r'\d+', title_link)
        title_kor = movie_search.get("channel").get("item")[int(num)].get(
            "title")[0].get("content")
        title_eng = movie_search.get("channel").get("item")[int(num)].get(
            "eng_title")[0].get("content")
        created_year = movie_search.get("channel").get("item")[int(num)].get(
            "year")[0].get("content")
        run_time = movie_search.get("channel").get("item")[int(num)].get(
            "open_info")[2].get("content")
        grade = movie_search.get("channel").get("item")[int(num)].get(
            "open_info")[1].get("content")
        synopsis = movie_search.get("channel").get("item")[int(num)].get(
            "story")[0].get("content")

        photo_list = []
        count = 1
        while True:
            try:
                photos = movie_search.get("channel").get("item")[int(num)].get(
                    "photo{}".format(count)).get("content")
                photo_list.append(photos)
                count += 1
            except:
                break

        resized_photo_url = []
        for image in photo_list:
            image_split = image.rsplit('/', 5)
            index = 4
            replacement = ['R200x0.q99', 'R500x0.q99', 'R700x0.q99']
            each_movie_photo_url = []
            for nums in range(3):
                image_split[index] = replacement[nums]
                each_movie_photo_url.append('/'.join(image_split))
            resized_photo_url.append(each_movie_photo_url)

        count = 0
        nation_list = []
        while True:
            try:
                nations = movie_search.get("channel").get("item")[int(
                    num)].get("nation")[count].get("content")
                nation_list.append(nations)
                count += 1
            except:
                break

        count = 0
        genre_list = []
        while True:
            try:
                genres = movie_search.get("channel").get("item")[int(num)].get(
                    "genre")[count].get("content")
                genre_list.append(genres)
                count += 1
            except:
                break

        director_info = []
        actor_info = []
        try:
            title_link = movie_search.get("channel").get("item")[int(num)].get(
                "title")[0].get("link")
            response = requests.get(title_link)
            bs = BeautifulSoup(response.text, "html.parser")
            count = 0
            while True:
                used_link = bs.select("ul.list_join li")[count]
                # role
                actor_role = used_link.select('span.txt_join')[0].text
                if "감독" in actor_role:
                    name_kor = used_link.select('em.emph_point')[0].text
                    name_kor_eng = used_link.select('strong.tit_join')[0].text
                    len_of_name_kor = len(name_kor) + 1
                    # English name
                    name_eng = name_kor_eng[len_of_name_kor:]
                    a_tag = used_link.findAll(
                        'a', attrs={'href': re.compile("/person/")})[0]
                    # actor id
                    actor_id = re.findall(r'\d+', a_tag['href'])
                    img_tag = used_link.select("img")[0]
                    # actor photo
                    profile_url = img_tag['src']
                    director_info.append({
                        'daum_id': actor_id,
                        'name_eng': name_eng,
                        'name_kor': name_kor,
                        'profile_url': profile_url
                    })
                    count += 1
                else:
                    name_kor = used_link.select('em.emph_point')[0].text
                    name_kor_eng = used_link.select('strong.tit_join')[0].text
                    len_of_name_kor = len(name_kor) + 1
                    # English name
                    name_eng = name_kor_eng[len_of_name_kor:]
                    a_tag = used_link.findAll(
                        'a', attrs={'href': re.compile("/person/")})[0]
                    # actor id
                    actor_id = re.findall(r'\d+', a_tag['href'])
                    img_tag = used_link.select("img")[0]
                    # actor photo
                    profile_url = img_tag['src']
                    actor_info.append({
                        'daum_id': actor_id,
                        'name_eng': name_eng,
                        'name_kor': name_kor,
                        'profile_url': profile_url,
                        'character_name': actor_role
                    })
                    count += 1
        except:
            pass

        video_list = []
        count = 0
        while True:
            try:
                videos = movie_search.get("channel").get("item")[int(num)].get(
                    "video")[count].get("link")
                if videos:
                    response_videos = requests.get(videos)
                    bs_videos = BeautifulSoup(response_videos.text, "html.parser")
                    meta, metaEnd = makeHTMLTags("meta")
                    img_meta = meta.copy().setParseAction(
                        withAttribute(('property', 'og:image')))
                    img_ref = img_meta
                    for img in img_ref.searchString(bs_videos):
                        content = img.content
                        video_trailer_id = content.split("/")[-2]
                        video_trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
                            video_trailer_id)
                        video_list.append(video_trailer_url)
                count += 1
            except:
                break

        trailer_link = movie_search.get("channel").get("item")[int(num)].get(
            "trailer")[0].get("link")
        if trailer_link:
            response = requests.get(trailer_link)
            bs = BeautifulSoup(response.text, "html.parser")
            meta, metaEnd = makeHTMLTags("meta")
            img_meta = meta.copy().setParseAction(
                withAttribute(('property', 'og:image')))
            img_ref = img_meta
            for img in img_ref.searchString(bs):
                content = img.content
                trailer_id = content.split("/")[-2]
                trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
                    trailer_id)

        movies_search.append({
            'title_kor': title_kor,
            'title_eng': title_eng,
            'nation_list': nation_list,
            # 'created_year': created_year,
            'img_url': movie_img_url,
            'run_time': run_time,
            'grade': grade,
            'director_info': director_info,
            'actor_info': actor_info,
            'genre_list': genre_list,
            # 'synopsis': synopsis,
            # 'photo_list': photo_list,
            # 'video_list': video_list,
        })

        if daum_id:
            for genres in genre_list:
                try:
                    genre = Genre.objects.create(genre=genres, )
                except:
                    genre = Genre.objects.filter(genre=genres)
            try:
                grade = Grade.objects.create(grade=grade, )
            except:
                grade = Grade.objects.get(grade=grade)
            for nations in nation_list:
                try:
                    nation = MakingCountry.objects.create(
                        making_country=nations, )
                except:
                    pass
                nation = MakingCountry.objects.filter(
                    making_country=nations)
            movie = Movie.objects.create(
                daum_id=daum_id[0],
                title_kor=title_kor,
                title_eng=title_eng,
                created_year=created_year,
                synopsis=synopsis,
                grade=grade,
                run_time=run_time,
                img_url=movie_img_url,
            )
            for actor in actor_info:
                actors = Actor.objects.get_or_create(
                    daum_id=actor['daum_id'][0],
                    name_eng=actor['name_eng'],
                    name_kor=actor['name_kor'],
                    profile_url=actor['profile_url'])
                movie_actor = MovieActor.objects.get_or_create(
                    movie=movie,
                    actor=actors[0],
                    character_name=actor['character_name'])
            for directors in director_info:
                director = Director.objects.get_or_create(
                    daum_id=directors['daum_id'][0],
                    name_eng=directors['name_eng'],
                    name_kor=directors['name_kor'],
                    profile_url=directors['profile_url'])
            for photo in resized_photo_url:
                try:
                    movie_image = MovieImages.objects.create(
                        movie=movie,
                        url=photo, )
                except:
                    pass
            specific_movie = Movie.objects.get(daum_id=daum_id[0])
            for genre in genre_list:
                g, created = Genre.objects.get_or_create(genre=genre)
                specific_movie.genre.add(g)
            for nation in nation_list:
                n, created = MakingCountry.objects.get_or_create(
                    making_country=nation)
                specific_movie.making_country.add(n)
            for director in director_info:
                d, created = Director.objects.get_or_create(
                    daum_id=director['daum_id'][0],
                    name_eng=director['name_eng'],
                    name_kor=director['name_kor'],
                    profile_url=director['profile_url'])
                specific_movie.director.add(d)
    return movies_search
from pyparsing import makeHTMLTags
import urllib

# read data from web page
url = "https://www.cia.gov/library/"\
      "publications/the-world-"\
      "factbook/docs/refmaps.html"
html = urllib.urlopen(url).read()

# define expression for <img> tag
imgTag, endImgTag = makeHTMLTags("img")

# search for matching tags, and
# print key attributes
for img in imgTag.searchString(html):
    print "'%(alt)s' : %(src)s" % img
import yaml
import argparse
import jsonpickle
import json
import markdown
import re

from pygments import highlight
from pygments.lexers import guess_lexer
from pygments.formatters import HtmlFormatter
from pyparsing import makeHTMLTags, replaceWith, withAttribute

cod = "../frontend/node_modules/.bin/cod"

spanOpen, spanClose = makeHTMLTags("span")
emptySpans = spanOpen.copy().setParseAction(withAttribute(empty=True))
removeSpans = emptySpans | spanOpen + spanClose
removeSpans.setParseAction(replaceWith(" "))

extensions = ['.less', '.css', '.sass', '.scss']
markup_blocks = {}
formatter = HtmlFormatter(cssclass='source-highlight')


def highlight_source(source):
    if not source:
        return ''
    lexer = guess_lexer(source)
    return highlight(source, lexer, formatter)
# Copyright 2004-2010, by Paul McGuire
# September, 2010 - updated to more current use of setResultsName, new NIST URL
#
from pyparsing import (Word, Combine, SkipTo, nums, makeHTMLTags,
                       delimitedList, alphas, alphanums)

try:
    import urllib.request
    urlopen = urllib.request.urlopen
except ImportError:
    import urllib
    urlopen = urllib.urlopen

integer = Word(nums)
ipAddress = Combine(integer + "." + integer + "." + integer + "." + integer)
hostname = delimitedList(Word(alphas, alphanums + "-_"), ".", combine=True)
tdStart, tdEnd = makeHTMLTags("td")
timeServerPattern = (tdStart + hostname("hostname") + tdEnd +
                     tdStart + ipAddress("ipAddr") + tdEnd +
                     tdStart + SkipTo(tdEnd)("loc") + tdEnd)

# get list of time servers
nistTimeServerURL = "https://tf.nist.gov/tf-cgi/servers.cgi#"
serverListPage = urlopen(nistTimeServerURL)
serverListHTML = serverListPage.read().decode("UTF-8")
serverListPage.close()

addrs = {}
for srvr, startloc, endloc in timeServerPattern.scanString(serverListHTML):
    print("{} ({}) - {}".format(srvr.ipAddr, srvr.hostname.strip(), srvr.loc.strip()))
    addrs[srvr.ipAddr] = srvr.loc
import urllib.request, urllib.parse, urllib.error
from pyparsing import makeHTMLTags, SkipTo

# read HTML from a web page
serverListPage = urllib.request.urlopen("http://www.yahoo.com")
htmlText = serverListPage.read().decode("utf-8")  # decode (assuming UTF-8) so pyparsing scans a str
serverListPage.close()

# using makeHTMLTags to define opening and closing tags
anchorStart, anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens, start, end in anchor.scanString(htmlText):
    print(tokens.body, '->', tokens.href)
def extract_bookmark(tag):
    link = tag[0].link
    description = tag[0].description
    description = description[0] if description else ''
    return {
        'link': link,
        'description': description,
    }


def extract_description(tag):
    return tag[0].strip()


# define grammar
dt_start, _ = pp.makeHTMLTags("DT")
dd_start, _ = pp.makeHTMLTags("DD")
a_start, a_end = pp.makeHTMLTags("A")

bookmark_link_tag = pp.Group(a_start + a_start.tag_body("text") + a_end.suppress())
bookmark_link_tag.addParseAction(extract_bookmark_link)

bookmark_description_tag = dd_start.suppress() + pp.SkipTo(
    pp.anyOpenTag | pp.anyCloseTag)("description")
bookmark_description_tag.addParseAction(extract_description)

bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") +
                        pp.ZeroOrMore(bookmark_description_tag)("description"))
bookmark_tag.addParseAction(extract_bookmark)


def parse(html: str) -> [NetscapeBookmark]:
    matches = bookmark_tag.searchString(html)
#
# htmlTableParser.py
#
# Example of parsing a simple HTML table into a list of rows, and optionally into a little database
#
# Copyright 2019, Paul McGuire
#
import pyparsing as pp
import urllib.request


# define basic HTML tags, and compose into a Table
table, table_end = pp.makeHTMLTags("table")
thead, thead_end = pp.makeHTMLTags("thead")
tbody, tbody_end = pp.makeHTMLTags("tbody")
tr, tr_end = pp.makeHTMLTags("tr")
th, th_end = pp.makeHTMLTags("th")
td, td_end = pp.makeHTMLTags("td")
a, a_end = pp.makeHTMLTags("a")

# method to strip HTML tags from a string - will be used to clean up content of table cells
strip_html = (pp.anyOpenTag | pp.anyCloseTag).suppress().transformString

# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple
link = pp.Group(a + a.tag_body("text") + a_end.suppress())


def extract_text_and_url(t):
    return (t[0].text, t[0].href)
#
# htmlTableParser.py
#
# Example of parsing a simple HTML table into a list of rows, and optionally into a little database
#
# Copyright 2019, Paul McGuire
#
import pyparsing as pp
import urllib.request


# define basic HTML tags, and compose into a Table
table, table_end = pp.makeHTMLTags('table')
thead, thead_end = pp.makeHTMLTags('thead')
tbody, tbody_end = pp.makeHTMLTags('tbody')
tr, tr_end = pp.makeHTMLTags('tr')
th, th_end = pp.makeHTMLTags('th')
td, td_end = pp.makeHTMLTags('td')
a, a_end = pp.makeHTMLTags('a')

# method to strip HTML tags from a string - will be used to clean up content of table cells
strip_html = (pp.anyOpenTag | pp.anyCloseTag).suppress().transformString

# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple
link = pp.Group(a + pp.SkipTo(a_end)('text') + a_end.suppress())
link.addParseAction(lambda t: (t[0].text, t[0].href))


# method to create table rows of header and data tags
def table_row(start_tag, end_tag):
    body = pp.SkipTo(end_tag)
# getNTPserversNew.py
#
# Demonstration of the parsing module, implementing a HTML page scanner,
# to extract a list of NTP time servers from the NIST web site.
#
# Copyright 2004-2010, by Paul McGuire
# September, 2010 - updated to more current use of setResultsName, new NIST URL
#
from pyparsing import (Word, Combine, Suppress, SkipTo, nums, makeHTMLTags,
                       delimitedList, alphas, alphanums)
import urllib

integer = Word(nums)
ipAddress = Combine(integer + "." + integer + "." + integer + "." + integer)
hostname = delimitedList(Word(alphas, alphanums + "-_"), ".", combine=True)
tdStart, tdEnd = makeHTMLTags("td")
timeServerPattern = (tdStart + hostname("hostname") + tdEnd +
                     tdStart + ipAddress("ipAddr") + tdEnd +
                     tdStart + SkipTo(tdEnd)("loc") + tdEnd)

# get list of time servers
nistTimeServerURL = "http://tf.nist.gov/tf-cgi/servers.cgi#"
serverListPage = urllib.urlopen(nistTimeServerURL)
serverListHTML = serverListPage.read()
serverListPage.close()

addrs = {}
for srvr, startloc, endloc in timeServerPattern.scanString(serverListHTML):
    print "%s (%s) - %s" % (srvr.ipAddr, srvr.hostname.strip(), srvr.loc.strip())
    addrs[srvr.ipAddr] = srvr.loc
def data_scrape(master_list_of_links):
    prefix = 'http://www.yelp.com'
    big_list = []
    for i in range(len(master_list_of_links)):
        time_between_big_links = randint(between_big_links_lower_bound, between_big_links_upper_bound)
        big_link = prefix + master_list_of_links[i]
        print(big_link)
        print("Scrape initiated")
        soup = link_opener(big_link)
        street = soup.find_all("span", itemprop="streetAddress")
        locality = soup.find_all("span", itemprop="addressLocality")
        state = soup.find_all("span", itemprop="addressRegion")
        zip_code = soup.find_all("span", itemprop="postalCode")
        phone = soup.find_all("span", class_="biz-phone")
        suffix = '?start='
        # review_count specifies how many search pages of reviews you will crawl through. This is set to go through at
        # most 320 reviews
        review_count = ['0', '40', '80', '120', '160', '200', '240', '280', '320']
        for j in review_count:
            time_between_review_pages = randint(between_review_pages_lower_bound, between_review_pages_upper_bound)
            print("processing...")
            new_link = big_link + suffix + j
            soup = link_opener(new_link)
            review_content = soup.find_all("div", class_="review-content")
            if not review_content:
                break
            meta_date = makeHTMLTags('meta')[0]
            meta_date.setParseAction(withAttribute(itemprop="datePublished"))
            meta_rating = makeHTMLTags('meta')[0]
            meta_rating.setParseAction(withAttribute(itemprop="ratingValue"))
            for k in review_content:
                indiv_list = [big_link]
                if not street:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(street[0].text)
                if not locality:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(locality[0].text)
                if not state:
                    indiv_list.append("DC")
                else:
                    indiv_list.append(state[0].text)
                if not zip_code:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(zip_code[0].text)
                if not phone:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(phone[0].text.strip())
                date = next(meta_date.scanString(k))[0]
                indiv_list.append(date.content)
                stars = next(meta_rating.scanString(k))[0]
                indiv_list.append(stars.content)
                indiv_list.append(k.p.text.encode("utf-8"))
                big_list.append(indiv_list)
            time.sleep(time_between_review_pages)
        print("Scrape complete!")
        time.sleep(time_between_big_links)
        print("")
    return big_list
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsing import makeHTMLTags
from urllib.request import urlopen
import pprint

# Define the pyparsing grammar for a URL, that is:
#    URLlink ::= <a href= URL>linkText</a>
#    URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link. Note also
# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
# pyparsing skips over whitespace between tokens.
linkOpenTag, linkCloseTag = makeHTMLTags("a")
link = linkOpenTag + linkOpenTag.tag_body("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.
with urlopen("https://www.cnn.com/") as serverListPage:
    htmlText = serverListPage.read().decode("UTF-8")  # decode (assuming UTF-8) so pyparsing scans a str

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print(toks.startA.href, "->", toks.body)

# Create dictionary from list comprehension, assembled from each pair of tokens returned
# from a matched URL.
pprint.pprint(
    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
)
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsing import makeHTMLTags, SkipTo, pyparsing_common
import urllib.request
from contextlib import closing
import pprint

linkOpenTag, linkCloseTag = makeHTMLTags('a')

linkBody = SkipTo(linkCloseTag)
linkBody.setParseAction(pyparsing_common.stripHTMLTags)
linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split()))

link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.
with closing(urllib.request.urlopen("http://www.yahoo.com")) as serverListPage:
    htmlText = serverListPage.read().decode("UTF-8")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print(toks.asList())

# Create dictionary from list comprehension, assembled from each pair of tokens returned
# from a matched URL.
pprint.pprint(
    dict((toks.body, toks.href) for toks, strt, end in link.scanString(htmlText))
)