Code Example #1
def _has_attributes(filename: str, tag: str, attrs: dict) -> bool:
    """
    Check ``HTML`` attributes' values.

    This method checks whether the tag (``tag``) inside the code file
    (``filename``) has attributes (``attrs``) with the specified values.

    :param filename: Path to the ``HTML`` source.
    :param tag: ``HTML`` tag to search for.
    :param attrs: Attributes with values to search for.
    :returns: True if the attributes are set as specified, False otherwise.
    """
    with open(filename, 'r', encoding='latin-1') as handle:
        html_doc = handle.read()

        tag_s, _ = makeHTMLTags(tag)
        tag_expr = tag_s

        result = False

        for expr in tag_expr.searchString(html_doc):
            for attr, value in attrs.items():
                try:
                    value.parseString(getattr(expr, attr))
                    result = True
                except ParseException:
                    result = False
                    break
            if result:
                break
        return result
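A minimal usage sketch (the file name and attribute values below are hypothetical, and makeHTMLTags/ParseException are assumed to be imported from pyparsing): because the function calls value.parseString(...) on each attribute, the dict values are expected to be pyparsing expressions rather than plain strings.

from pyparsing import CaselessKeyword

# hypothetical file: does page.html contain an <input> tag whose type matches "hidden"?
if _has_attributes('page.html', 'input', {'type': CaselessKeyword('hidden')}):
    print('found a hidden input')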
Code Example #2
File: emailLister.py Project: chemberlen/our-py-lib
def getBoldUrls(lines=[],sub=0):
    abstart,abend = pyparsing.makeHTMLTags('B')
    grammer2 = abstart + pyparsing.SkipTo(abend) + abend.suppress()
    for x1,x2,x3 in grammer2.scanString(''.join(lines)):
        print x1
        print x2
        print x3
Code Example #3
File: pubmed.py Project: JunZhuSecurity/pypub
def get_pmid_from_summary(raw_xml):
    id_start, id_end = pyparsing.makeHTMLTags("Id")
    id_pattern = id_start.suppress() + pyparsing.Word(pyparsing.nums, min=1)("pmid") + id_end.suppress()
    try:
        pmids = id_pattern.searchString(raw_xml).asList()[0]
    except IndexError:
        pmids = []
    return pmids
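A hypothetical call illustrating the expected input shape (PubMed-style XML containing <Id> elements); the digits of the first <Id> are returned, or an empty list when nothing matches:

# made-up ESearch-style snippet for illustration only
sample_xml = "<eSearchResult><IdList><Id>31452104</Id></IdList></eSearchResult>"
print(get_pmid_from_summary(sample_xml))            # -> ['31452104']
print(get_pmid_from_summary("<IdList></IdList>"))   # -> []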
Code Example #4
 def _extract_meta_options(self):
     """Fill options dictionary with metatags of template."""
     meta_start, meta_end = makeHTMLTags("meta")
     for token, start, end in meta_start.scanString(self.template):
         if ":" in token.name:
             value = token.content
             if token.name.startswith('if:'):
                 value = bool(int(value))
             self.options[token.name] = value
Code Example #5
def get_pmid_from_summary(raw_xml):
    id_start, id_end = pyparsing.makeHTMLTags("Id")
    id_pattern = id_start.suppress() + pyparsing.Word(
        pyparsing.nums, min=1)("pmid") + id_end.suppress()
    try:
        pmids = id_pattern.searchString(raw_xml).asList()[0]
    except IndexError:
        pmids = []
    return pmids
Code Example #6
 def _extract_meta_options(self):
     """Fill options dictionary with metatags of template."""
     meta_start, meta_end = makeHTMLTags("meta")
     for token, start, end in meta_start.scanString(self.template):
         if ":" in token.name:
             value = token.content
             if token.name.startswith('if:'):
                 value = bool(int(value))
                 key = token.name.replace('if:', '')
                 key = ''.join(word.capitalize() for word in re.split(r'\s+', key))
             self.options[token.name] = value
Code Example #7
File: myParser2.py Project: chemberlen/our-py-lib
def getUrls(lines=[]):

    grammer = ''

    astart,aend = pyparsing.makeHTMLTags('a')
    grammer = astart + pyparsing.SkipTo(aend) + aend.suppress()

    urls = []
    for x1,x2,x3 in grammer.scanString(''.join(lines)):
        urls.append(str(x1[1][1]))
    return urls
Code Example #8
File: daum.py Project: byunghyunpark/popcorn-backend
def video_search(request):
    video_response = requests.get(request)
    bs_videos = BeautifulSoup(video_response.text, 'html.parser')
    meta, metaEng = makeHTMLTags("meta")
    img_meta = meta.copy().setParseAction(
        withAttribute(('property', 'og:image')))
    for img in img_meta.searchString(bs_videos):
        content = img.content
        video_trailer_id = content.split("/")[-2]
        video_trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
            video_trailer_id)
    return video_trailer_url
Code Example #9
File: rerun_autoconf.py Project: agander/gander
def pyparsing():
    from pyparsing import makeHTMLTags, SkipTo, htmlComment
    import urllib
    serverListPage = \
    urllib.urlopen( "http://agander.home/" )
    htmlText = serverListPage.read()
    serverListPage.close()
    aStart, aEnd = makeHTMLTags("A")
    link = aStart + SkipTo(aEnd).setResultsName("link") + aEnd
    link.ignore(htmlComment)
    for toks, start, end in link.scanString(htmlText):
        print('{} -> {}'.format(toks.link, toks.startA.href))
Code Example #10
File: emailLister3.py Project: chemberlen/our-py-lib
def getUrls(lines=[], sub=0):
    grammer = ''
    astart, aend = pyparsing.makeHTMLTags('a')
    grammer = astart + pyparsing.SkipTo(aend) + aend.suppress()
    urls = []
    for x1, x2, x3 in grammer.scanString(''.join(lines)):
        if sub:
            if len(x1) == 5:
                print x1[4]
            if len(x1) == 6:
                urls.append(str(x1[5]))
        else:
            urls.append(str(x1[1][1]))

    return urls
Code Example #11
File: emailLister3.py Project: chemberlen/our-py-lib
def getEmailUrls(lines=[]):
    grammer = ''
    astart, aend = pyparsing.makeHTMLTags('a')
    grammer = astart + pyparsing.SkipTo(aend) + aend.suppress()
    urls = []
    for x1, x2, x3 in grammer.scanString(''.join(lines)):
        if len(x1) == 5:
            urls.append(x1[4])

    for eachUrls in urls:
        if eachUrls.find('alt="Search ') > 0:
            data = eachUrls[eachUrls.find('alt="Search ') +
                            len('alt="Search '):len(eachUrls) - 2]
            print data

    return urls
Code Example #12
    def get_linked_articles(self, wikipage):

        # Define the pyparsing grammar for a URL, that is:
        #    URLlink ::= <a href= URL>linkText</a>
        #    URL ::= doubleQuotedString | alphanumericWordPath
        # Note that whitespace may appear just about anywhere in the link.  Note also
        # that it is not necessary to explicitly show this in the pyparsing grammar; by default,
        # pyparsing skips over whitespace between tokens.

        linkOpenTag, linkCloseTag = makeHTMLTags("a")
        link = linkOpenTag + SkipTo(linkCloseTag).setResultsName(
            "body") + linkCloseTag.suppress()

        # Go get some HTML with some links in it.
        # serverListPage = urllib.urlopen( "http://de.wikipedia.org/w/index.php?title=Hauptseite&redirect=no" )
        # htmlText = serverListPage.read()
        # serverListPage.close()
        #
        # print htmlText
        #
        # scanString is a generator that loops through the input htmlText, and for each
        # match yields the tokens and start and end locations (for this application, we are
        # not interested in the start and end values).

        articles = set()

        for toks, strt, end in link.scanString(wikipage):
            if (len(toks.startA.href) != 0 and                   # remove empty links
                    toks.startA.href.find('#') == -1 and         # remove anchors
                    toks.startA.href.find(':') == -1 and         # remove wikipedia special links
                    toks.startA.href.find('?') == -1 and         # remove wikipedia special links
                    toks.startA.href.find('Hauptseite') == -1):  # remove link to main page
                if toks.body == "Artikel":
                    # save real(!) article name, so we don't get confused by redirects
                    articlename = toks.startA.href.lstrip('/wiki/')
                else:
                    articles.add(toks.startA.href.lstrip('/wiki/'))
                # print toks.startA.href, "->", toks.body

        return (articles, articlename)
Code Example #13
File: search.py Project: floft/flowgen
def parse(db,url):
	global add
	global urls

	try:
		if not re.search('^http://',url):
			url=siteurl+"/"+url
			url="http://"+url.replace("//","/")

		request=urllib.request.Request(url)
		request.add_header('User-Agent', 'Flowgen/1.0 (http://floft.net)')
		page=urllib.request.urlopen(request)
		html=page.read().decode("utf-8")
		page.close()

		print("Notice: processing {}".format(url))

		#get urls
		linkOpenTag,linkCloseTag = makeHTMLTags("a")
		link = linkOpenTag + SkipTo(linkCloseTag).setResultsName("body") + linkCloseTag.suppress()

		for toks,strt,end in link.scanString(html):
			newurl=toks.startA.href

			if newurl not in urls and newurl not in visited:
				if re.search('^(/|http://'+siteurl+')',newurl) and not \
				   re.search('(jpg|png|flac|mp3|zip|pdf)$',newurl):
					urls.append(newurl)

		#get title
		try:
			title=re.search('<title>([^<]*)</title>',html).groups()[0]
		except:
			title="Untitled"
		
		#get text
		xml=lxml.html.document_fromstring(html.replace(">","> ").replace("<", " <"))
		text=xml.cssselect('body')[0].text_content().replace("\n"," ").strip()

		#add to database
		add.append([time(),title,url,text])
	except:
		print("Error: {} does not load".format(url))
Code Example #14
File: emailLister.py Project: chemberlen/our-py-lib
def getUrls(lines=[],sub=0):
    grammer = ''
    astart,aend = pyparsing.makeHTMLTags('a')
    grammer = astart + pyparsing.SkipTo(aend) + aend.suppress()
##    grammer2 = abstart + pyparsing.SkipTo(abend) + abend.suppress()
##    for x1,x2,x3 in grammer.scanString(''.join(lines)):
##        print x1
##        print x2
##        print x3



    urls = []
    for x1,x2,x3 in grammer.scanString(''.join(lines)):
        if sub:
            if len(x1)==6:
                mailreadstatus = 1 if str(x1[1]).find('db_chkstatus=1')>0 else 0
                mailsubject = str(x1[5])
                maillink = str(x1[1][1])
                nxtSender=True
            if len(x1)==5 and nxtSender:
                mailsender = ''
                if dict(x1).has_key('title'):
                    mailsender = str(x1['title']).replace('Search ','')
                    nxtTime=True
                    nxtSender=False
            if len(x1)==4 and nxtTime:
                mailtime = x1[3]
                urls.append([mailsubject,mailreadstatus,mailsender,mailtime,maillink])
                nxtSender=False
                nxtTime=False
                mailreadstatus=0
                mailsubject = ''
                mailsender=''
                mailtime = ''
                maillink = ''

        else:
            urls.append(str(x1[1][1]))
            nxtSender=False
            nxtTime=False
    return urls
Code Example #15
    def initGrammar(self):
        L_Equals = Word("=")
        N_comment = htmlComment()

        N_name = CharsNotIn("{}|[]")
        N_simpleText = SkipTo(
            oneOf(["{{", "|", "[[", "]]", "}}", "'''", "<ref"]))

        N_elements = Forward()
        N_apostrofs = QuotedString("'''").setParseAction(
            lambda s, l, t: {'APOSTROFS': t})
        N_link = nestedExpr(
            opener="[[",
            closer="]]",
            content=N_name +
            Optional("|" + delimitedList(CharsNotIn("[]"), delim="|"))
        ).setParseAction(self.genLink)
        N_header = Group(L_Equals + SkipTo("=") + L_Equals).setParseAction(
            lambda s, l, t: {'HEADER': t})
        N_template = Forward()
        N_key = CharsNotIn("{}|=")
        # N_value = ZeroOrMore(CharsNotIn("{}|")) + ZeroOrMore(N_template + ZeroOrMore(CharsNotIn("{}|"))).setResultsName('VALUE')
        N_keyValues = "|" + delimitedList(
            Group(Optional(N_key) + Optional("=" + N_elements)), delim="|")
        N_label_content = N_template | ("{{" + OneOrMore("!") +
                                        "}}") | CharsNotIn("{}|")
        N_label = nestedExpr(opener="{", closer="}", content=N_label_content)
        N_template << nestedExpr(
            opener="{{", closer="}}", content=N_name +
            Optional(N_keyValues)).setParseAction(self.genTemplate)

        ref_start, ref_end = makeHTMLTags("ref")
        N_named_ref = ref_start + SkipTo(ref_end) + ref_end
        N_named_ref.setParseAction(lambda s, l, t: {'REF': t})

        N_element = N_comment | N_simpleText | N_named_ref | N_apostrofs | N_link | N_header | N_template | N_label

        # N_ref = nestedExpr( opener="<ref>", closer="</ref>", content=N_elements).setParseAction( lambda s,l,t: {'REF' : t} )
        N_elements << ZeroOrMore(N_element)

        self.N_S = N_elements
Code Example #16
def getUseriCalZip(todaysDate, userName, samlResponse, br, logOut):
    samlResponseText = samlResponse.read()
    theStart,theEnd = makeHTMLTags("textarea")
    search = theStart + SkipTo(theEnd)("body")+ theEnd
            
    saml_resp_str = search.searchString(samlResponseText)[0].body
    relay_state_str = search.searchString(samlResponseText)[1].body
    
    fileNametoSave = todaysDate + "/" + userName + ".zip"
    
    br.select_form(name="acsForm")
    br["SAMLResponse"] = saml_resp_str
    br["RelayState"] = relay_state_str
    try: 
        br.submit()
    except:
        print "WARN: trouble downloading cal data for " + userName + ": " +  " at " + time.strftime('%H:%M') + ".\n"
        logOut.flush()
        time.sleep(60)
        try:
            br.submit()
        except:
            print "FAIL: can't open cal session for " + userName + ": " +  " at " + time.strftime('%H:%M') + ".\n"
            logOut.flush()
        else:
            print "OKAY - second try - retrieving cal data for user " + userName + " at " + time.strftime('%H:%M') + ".\n"
            logOut.flush()
            try: 
                br.retrieve('https://www.google.com/calendar/exporticalzip',fileNametoSave) 
            except:
                print "FAIL: can't open cal session for " + userName + ": " +  " at " + time.strftime('%H:%M') + ".\n"
                logOut.flush()
    else:
        print "Retrieving cal data for user " + userName + " at " + time.strftime('%H:%M') + ".\n"
        logOut.flush()
        try:
            br.retrieve('https://www.google.com/calendar/exporticalzip',fileNametoSave)
        except:
            print "FAIL: can't download cal data for " + userName + ": " +  " at " + time.strftime('%H:%M') + ".\n"
            logOut.flush()  
Code Example #17
def lr1():
    import urllib.request
    import urllib.parse
    # import requests
    from pyparsing import makeHTMLTags, SkipTo, withAttribute
    from prettytable import PrettyTable

    print("Parsing https://www.worldcoinindex.com/")
    url = 'https://www.worldcoinindex.com'
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    resp = urllib.request.urlopen(req)
    respData = str(resp.read())
    resp.close()
    tbody_Start, tbody_End = makeHTMLTags('tbody')
    tbody = tbody_Start + SkipTo(tbody_End)("body") + tbody_End
    tbody_string = ""
    for tokens, start, end in tbody.scanString(respData):
        tbody_string = tbody_string + tokens.body
    # print(tbody_string)

    # creating a list for bitcoin names
    btc = []
    # parsing bitcoin names
    h1_Start, h1_End = makeHTMLTags('h1')
    h1_body = h1_Start + SkipTo(h1_End)("body") + h1_End
    bitcoin_name = ""
    for tokens, start, end in h1_body.scanString(tbody_string):
        bitcoin_name = bitcoin_name + "\n" + tokens.body

    # getting rid of <span>
    span_start, span_end = makeHTMLTags("span")
    span_body = span_start + SkipTo(span_start | span_end)("body")
    for tokens, start, end in span_body.scanString(bitcoin_name):
        btc.append(tokens.body)

    # creating a list for bitcoin prices
    prices = []
    # parsing bitcoin prices
    price_start, price_end = makeHTMLTags('td')
    price_td = price_start.setParseAction(
        withAttribute(**{"class": "number pricekoers lastprice"}))
    price_body = price_td + SkipTo(price_start | price_end)("body")
    price_string = ""
    for tokens, start, end in price_body.scanString(respData):
        price_string = price_string + "\n" + tokens.body

    # getting rid of <span>
    span_class = span_start.setParseAction(withAttribute(**{"class": "span"}))
    span_body = span_class + SkipTo(span_class | span_end)("body")
    for tokens, start, end in span_body.scanString(price_string):
        prices.append(tokens.body)
    # print(prices)

    # generating PrettyTable
    t = PrettyTable()
    t.field_names = [" ", "Name", "Recent Price"]
    i = 0
    for x in btc:
        t.add_row([i + 1, x, prices[i]])
        i = i + 1
    t.align["Name"] = "c"
    t.align["Recent Price"] = "c"
    print(t)

    # saving data
    f = open('logs.txt', 'w')
    f.writelines(str(t))
    f.close()
Code Example #18
#!/usr/bin/env python
# (c) 2016 John Strickler
#
import requests
from pprint import pprint
import pyparsing as pp

response = requests.get('http://www.python.org')

html = response.content

link_start, link_end = pp.makeHTMLTags('a')
link_content = pp.SkipTo(link_end).setResultsName('link')
full_link = link_start + link_content + link_end.suppress()

count = 1
for token, _, _ in full_link.scanString(html):
    if token.href.lower().startswith('http'):
        print("LINK {}:".format(count))
        print("{} ==> {}".format(token.link, token.href))
        print(('-' * 60))
        count += 1
Code Example #19
from pyparsing import makeHTMLTags,SkipTo,htmlComment
import urllib.request, urllib.parse, urllib.error

serverListPage = urllib.request.urlopen( "http://www.yahoo.com" )
htmlText = serverListPage.read()
serverListPage.close()

aStart,aEnd = makeHTMLTags("A")

link = aStart + SkipTo(aEnd).setResultsName("link") + aEnd
link.ignore(htmlComment)

for toks,start,end in link.scanString(htmlText):
    print(toks.link, "->", toks.startA.href)
Code Example #20
File: kml.py Project: dfeeney/datachef
#http://pyparsing.wikispaces.com/HowToUsePyparsing
from pyparsing import Word, alphas, alphanums, nums, Literal, restOfLine, OneOrMore, \
    empty, Suppress, replaceWith, Group, Optional
from pyparsing import makeHTMLTags, withAttribute, SkipTo
from pyparsing import Suppress

#Dirty, dirty hack. Move to proper XML soonest

#td_start, td_end = makeHTMLTags("td")
pm_start, pm_end = makeHTMLTags("Placemark")
name_start, name_end = makeHTMLTags("name")
desc_start, desc_end = makeHTMLTags("description")
poly_start, poly_end = makeHTMLTags("Polygon")
ed_start, ed_end = makeHTMLTags("ExtendedData")


placemark_set = ( pm_start 
               #+ SkipTo(name_start) + name_start + SkipTo(name_end)("name") + name_end #Note: this might be a CDATA Section
               + Optional(SkipTo(desc_start) + desc_start + SkipTo(desc_end)("description") + desc_end)
               #+ SkipTo(desc_start) + desc_start + "<![CDATA[" + SkipTo("]]>")("description") + ']]>' + desc_end
               + SkipTo(poly_start) + poly_start + SkipTo(poly_end)("polygon") + poly_end
               + Optional(SkipTo(ed_start) + Group(SkipTo(ed_end) + ed_end)("ext_data"))
               + SkipTo(pm_end) + pm_end
             )


def gen_record(text):
    for data, startloc, endloc in placemark_set.scanString(text):
        yield data

    #print data.pid[1], ': ' , data.coords.split()[0].rsplit(',', 1)[0]
Code Example #21
File: main.py Project: tanks/readabilitypy
 def trans_tag(self, ltext, tag, fun):
     aopen, aclose = pyparsing.makeHTMLTags(tag)
     a = aopen + pyparsing.SkipTo(aclose).setResultsName("body") + aclose
     a.setParseAction(fun)
     ltext = a.transformString(ltext)
     return ltext
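A hypothetical way to exercise trans_tag (self is unused, so it can be called unbound): the parse action receives the matched tokens, and whatever string it returns replaces the whole <tag>...</tag> match in transformString.

import pyparsing

def emphasize(tokens):
    # replace the entire <b>...</b> match with just its body text, upper-cased
    return tokens.body.upper()

print(trans_tag(None, "before <b>bold text</b> after", "b", emphasize))
# -> before BOLD TEXT after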
Code Example #22
File: htmlStripper.py Project: svn2github/pyparsing
#
# htmlStripper.py
#
#  Sample code for stripping HTML markup tags and scripts from 
#  HTML source files.
#
# Copyright (c) 2006, 2016, Paul McGuire
#
from contextlib import closing
import urllib.request, urllib.parse, urllib.error
from pyparsing import (makeHTMLTags, SkipTo, commonHTMLEntity, replaceHTMLEntity, 
    htmlComment, anyOpenTag, anyCloseTag, LineEnd, OneOrMore, replaceWith)

scriptOpen,scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + SkipTo(scriptClose) + scriptClose
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "http://wiki.python.org/moin/PythonDecoratorLibrary"
with closing(urllib.request.urlopen( targetURL )) as targetPage:
    targetHTML = targetPage.read().decode("UTF-8")

# first pass, strip out tags and translate entities
firstPass = (htmlComment | scriptBody | commonHTMLEntity | 
             anyOpenTag | anyCloseTag ).suppress().transformString(targetHTML)

# first pass leaves many blank lines, collapse these down
repeatedNewlines = LineEnd() + OneOrMore(LineEnd())
repeatedNewlines.setParseAction(replaceWith("\n\n"))
secondPass = repeatedNewlines.transformString(firstPass)
Code Example #23
File: withAttribute.py Project: delavodix/pyparsing
#
#  withAttribute.py
#  Copyright, 2007 - Paul McGuire
#
#  Simple example of using withAttribute parse action helper
#  to define matching conditions on a tag's attribute values
#
import pyparsing as pp

data = """\
    <td align=right width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;49.950&nbsp;</font></td>
    <td align=left width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;50.950&nbsp;</font></td>
    <td align=right width=80><font size=2 face="New Times Roman,Times,Serif">&nbsp;51.950&nbsp;</font></td>
    """

td, tdEnd = pp.makeHTMLTags("TD")
font, fontEnd = pp.makeHTMLTags("FONT")
realNum = pp.pyparsing_common.real
NBSP = pp.Literal("&nbsp;")
patt = td + font + NBSP + realNum("value") + NBSP + fontEnd + tdEnd

# always use addParseAction when adding withAttribute as a parse action to a start tag
td.addParseAction(pp.withAttribute(align="right", width="80"))

for s in patt.searchString(data):
    print(s.value)
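withAttribute also accepts withAttribute.ANY_VALUE when only the presence of an attribute matters, not its value. A small variation on the definitions above (a sketch reusing data, font, NBSP, realNum, fontEnd and tdEnd from this example):

# match any <td> that has an align attribute, whatever its value
td_any, _ = pp.makeHTMLTags("TD")
td_any.addParseAction(pp.withAttribute(align=pp.withAttribute.ANY_VALUE))

patt_any = td_any + font + NBSP + realNum("value") + NBSP + fontEnd + tdEnd
for s in patt_any.searchString(data):
    print(s.value)   # prints all three values, since every cell here carries an align attribute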
Code Example #24
#
# Copyright (c) 2006, 2016, Paul McGuire
#
from urllib.request import urlopen
from pyparsing import (
    makeHTMLTags,
    commonHTMLEntity,
    replaceHTMLEntity,
    htmlComment,
    anyOpenTag,
    anyCloseTag,
    LineEnd,
    replaceWith,
)

scriptOpen, scriptClose = makeHTMLTags("script")
scriptBody = scriptOpen + scriptOpen.tag_body + scriptClose
commonHTMLEntity.setParseAction(replaceHTMLEntity)

# get some HTML
targetURL = "https://wiki.python.org/moin/PythonDecoratorLibrary"
with urlopen(targetURL) as targetPage:
    targetHTML = targetPage.read().decode("UTF-8")

# first pass, strip out tags and translate entities
firstPass = ((htmlComment | scriptBody | commonHTMLEntity | anyOpenTag
              | anyCloseTag).suppress().transformString(targetHTML))

# first pass leaves many blank lines, collapse these down
repeatedNewlines = LineEnd() * (2, )
repeatedNewlines.setParseAction(replaceWith("\n\n"))
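The excerpt stops here; a likely continuation, mirroring the earlier htmlStripper example, applies the second pass and prints the cleaned-up text:

# assumed continuation, not part of the original excerpt
secondPass = repeatedNewlines.transformString(firstPass)
print(secondPass)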
Code Example #25
import cgi
from pyparsing import makeHTMLTags, SkipTo

raw = """<body><div class="shoveler" id="purchaseShvl">
<p>Customers who bought this item also bought</p>
<div class="foo">
    <span class="bar">Shovel cozy</span>
    <span class="bar">Shovel rack</span>
</div>
</div></body>"""

def foo(parseResult):
    parts = []
    for token in parseResult:
        st = '<div id="%s" class="%s">' % \
             (cgi.escape(getattr(token, 'id')),
             cgi.escape(getattr(token, 'class')))
        parts.append(st + token.body + token.endDiv)
    return '\n'.join(parts)

start, end = makeHTMLTags('div')
anchor = start + SkipTo(end).setResultsName('body') + end
res = anchor.searchString(raw)
print foo(res)
Code Example #26
from pyparsing import makeHTMLTags, withAttribute, Suppress, Regex, Group
import urllib

year = '2014'

conn = urllib.urlopen('http://www.boxofficemojo.com/yearly/chart/?yr=' + year +
                      '&p=.htm')
""" looking for this recurring pattern:
          <td valign="top" tdalign="center">00-03</td>
          <td valign="top">.50</td>
          <td valign="top">.50</td>

    and want a dict with keys 0, 1, 2, and 3 all with values (.50,.50)
"""

td, tdend = makeHTMLTags("td")
keytd = td.copy().setParseAction(withAttribute(tdalign="center"))
td, tdend, keytd = map(Suppress, (td, tdend, keytd))

realnum = Regex(r'1?\.\d+').setParseAction(lambda t: float(t[0]))
integer = Regex(r'\d{1,3}').setParseAction(lambda t: int(t[0]))
DASH = Suppress('-')

# build up an expression matching the HTML bits above: a "key" cell holding a
# range like 00-03, followed by two value cells
entryExpr = (keytd + integer("start") + DASH + integer("end") + tdend +
             Group(2 * (td + realnum + tdend))("vals"))

# search the input HTML for matches to the entryExpr expression, and build up lookup dict
lookup = {}
for entry in entryExpr.searchString(conn.read()):
    for i in range(entry.start, entry.end + 1):
        lookup[i] = tuple(entry.vals)
Code Example #27
import urllib

from pyparsing import makeHTMLTags, SkipTo

# read HTML from a web page
serverListPage = urllib.urlopen( "http://www.yahoo.com" )
htmlText = serverListPage.read()
serverListPage.close()

# using makeHTMLTags to define opening and closing tags
anchorStart,anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens,start,end in anchor.scanString(htmlText):
    print tokens.body,'->',tokens.href
Code Example #28
def movie_search(keyword, daum_id=None):
    r = requests.get(
        "https://apis.daum.net/contents/movie?apikey={}&q={}&output=json".
        format(settings.DAUM_API_KEY, keyword))
    movie_search = r.json()
    movies_search = []
    num_of_movies = movie_search.get("channel").get("totalCount")

    for num in range(num_of_movies):
        img_url = movie_search.get("channel").get("item")[int(num)].get(
            "thumbnail")[0].get("content")
        # image sizes (S, M, L)
        image_split = img_url.rsplit('/', 5)
        index = 4
        replacement = ['R200x0.q99', 'R500x0.q99', 'R700x0.q99']
        movie_img_url = []
        for nums in range(3):
            image_split[index] = replacement[nums]
            movie_img_url.append('/'.join(image_split))

        title_link = movie_search.get("channel").get("item")[int(num)].get(
            "title")[0].get("link")
        daum_id = re.findall(r'\d+', title_link)
        title_kor = movie_search.get("channel").get("item")[int(num)].get(
            "title")[0].get("content")
        title_eng = movie_search.get("channel").get("item")[int(num)].get(
            "eng_title")[0].get("content")
        created_year = movie_search.get("channel").get("item")[int(num)].get(
            "year")[0].get("content")
        run_time = movie_search.get("channel").get("item")[int(num)].get(
            "open_info")[2].get("content")
        grade = movie_search.get("channel").get("item")[int(num)].get(
            "open_info")[1].get("content")
        synopsis = movie_search.get("channel").get("item")[int(num)].get(
            "story")[0].get("content")

        photo_list = []
        count = 1
        while True:
            try:
                photos = movie_search.get("channel").get("item")[int(num)].get(
                    "photo{}".format(count)).get("content")
                photo_list.append(photos)
                count += 1
            except:
                break

        resized_photo_url = []
        for image in photo_list:
            image_split = image.rsplit('/', 5)
            index = 4
            replacement = ['R200x0.q99', 'R500x0.q99', 'R700x0.q99']
            each_movie_photo_url = []
            for nums in range(3):
                image_split[index] = replacement[nums]
                each_movie_photo_url.append('/'.join(image_split))
            resized_photo_url.append(each_movie_photo_url)

        count = 0
        nation_list = []
        while True:
            try:
                nations = movie_search.get("channel").get("item")[int(
                    num)].get("nation")[count].get("content")
                nation_list.append(nations)
                count += 1
            except:
                break

        count = 0
        genre_list = []
        while True:
            try:
                genres = movie_search.get("channel").get("item")[int(num)].get(
                    "genre")[count].get("content")
                genre_list.append(genres)
                count += 1
            except:
                break

        director_info = []
        actor_info = []
        try:
            title_link = movie_search.get("channel").get("item")[int(num)].get(
                "title")[0].get("link")
            response = requests.get(title_link)
            bs = BeautifulSoup(response.text, "html.parser")
            count = 0

            while True:
                used_link = bs.select("ul.list_join li")[count]

                # role
                actor_role = used_link.select('span.txt_join')[0].text
                if "감독" in actor_role:
                    name_kor = used_link.select('em.emph_point')[0].text
                    name_kor_eng = used_link.select('strong.tit_join')[0].text
                    len_of_name_kor = len(name_kor) + 1
                    # English name
                    name_eng = name_kor_eng[len_of_name_kor:]
                    a_tag = used_link.findAll(
                        'a', attrs={'href': re.compile("/person/")})[0]
                    # actor ID
                    actor_id = re.findall(r'\d+', a_tag['href'])
                    img_tag = used_link.select("img")[0]
                    # actor photo
                    profile_url = img_tag['src']

                    director_info.append({
                        'daum_id': actor_id,
                        'name_eng': name_eng,
                        'name_kor': name_kor,
                        'profile_url': profile_url
                    })
                    count += 1
                else:
                    name_kor = used_link.select('em.emph_point')[0].text
                    name_kor_eng = used_link.select('strong.tit_join')[0].text
                    len_of_name_kor = len(name_kor) + 1
                    # English name
                    name_eng = name_kor_eng[len_of_name_kor:]
                    a_tag = used_link.findAll(
                        'a', attrs={'href': re.compile("/person/")})[0]
                    # actor ID
                    actor_id = re.findall(r'\d+', a_tag['href'])
                    img_tag = used_link.select("img")[0]
                    # actor photo
                    profile_url = img_tag['src']

                    actor_info.append({
                        'daum_id': actor_id,
                        'name_eng': name_eng,
                        'name_kor': name_kor,
                        'profile_url': profile_url,
                        'character_name': actor_role
                    })
                    count += 1
        except:
            pass

        video_list = []
        count = 0
        while True:
            try:
                videos = movie_search.get("channel").get("item")[int(num)].get(
                    "video")[count].get("link")
                if videos:
                    response_videos = requests.get(videos)
                    bs_videos = BeautifulSoup(response_videos.text,
                                              "html.parser")
                    meta, metaEnd = makeHTMLTags("meta")
                    img_meta = meta.copy().setParseAction(
                        withAttribute(('property', 'og:image')))
                    img_ref = img_meta
                    for img in img_ref.searchString(bs_videos):
                        content = img.content
                    video_trailer_id = content.split("/")[-2]
                    video_trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
                        video_trailer_id)
                    video_list.append(video_trailer_url)
                count += 1
            except:
                break

        trailer_link = movie_search.get("channel").get("item")[int(num)].get(
            "trailer")[0].get("link")
        if trailer_link:
            response = requests.get(trailer_link)
            bs = BeautifulSoup(response.text, "html.parser")
            meta, metaEnd = makeHTMLTags("meta")
            img_meta = meta.copy().setParseAction(
                withAttribute(('property', 'og:image')))
            img_ref = img_meta
            for img in img_ref.searchString(bs):
                content = img.content
            trailer_id = content.split("/")[-2]
            trailer_url = "http://videofarm.daum.net/controller/video/viewer/Video.html?vid={}&play_loc=daum_movie&autoplay=true".format(
                trailer_id)
        movies_search.append({
            'title_kor': title_kor,
            'title_eng': title_eng,
            'nation_list': nation_list,
            # 'created_year': created_year,
            'img_url': movie_img_url,
            'run_time': run_time,
            'grade': grade,
            'director_info': director_info,
            'actor_info': actor_info,
            'genre_list': genre_list,
            # 'synopsis': synopsis,
            # 'photo_list': photo_list,
            # 'video_list': video_list,
        })

        if daum_id:
            for genres in genre_list:
                try:
                    genre = Genre.objects.create(genre=genres, )
                except:
                    genre = Genre.objects.filter(genre=genres)

            try:
                grade = Grade.objects.create(grade=grade, )
            except:
                grade = Grade.objects.get(grade=grade)

            for nations in nation_list:
                try:
                    nation = MakingCountry.objects.create(
                        making_country=nations, )
                except:
                    pass
                    nation = MakingCountry.objects.filter(
                        making_country=nations)

            movie = Movie.objects.create(
                daum_id=daum_id[0],
                title_kor=title_kor,
                title_eng=title_eng,
                created_year=created_year,
                synopsis=synopsis,
                grade=grade,
                run_time=run_time,
                img_url=movie_img_url,
            )

            for actor in actor_info:
                actors = Actor.objects.get_or_create(
                    daum_id=actor['daum_id'][0],
                    name_eng=actor['name_eng'],
                    name_kor=actor['name_kor'],
                    profile_url=actor['profile_url'])
                movie_actor = MovieActor.objects.get_or_create(
                    movie=movie,
                    actor=actors[0],
                    character_name=actor['character_name'])

            for directors in director_info:
                director = Director.objects.get_or_create(
                    daum_id=directors['daum_id'][0],
                    name_eng=directors['name_eng'],
                    name_kor=directors['name_kor'],
                    profile_url=directors['profile_url'])

            for photo in resized_photo_url:
                try:
                    movie_image = MovieImages.objects.create(
                        movie=movie,
                        url=photo,
                    )
                except:
                    pass

            specific_movie = Movie.objects.get(daum_id=daum_id[0])
            for genre in genre_list:
                g, created = Genre.objects.get_or_create(genre=genre)
                specific_movie.genre.add(g)

            for nation in nation_list:
                n, created = MakingCountry.objects.get_or_create(
                    making_country=nation)
                specific_movie.making_country.add(n)

            for director in director_info:
                d, created = Director.objects.get_or_create(
                    daum_id=director['daum_id'][0],
                    name_eng=director['name_eng'],
                    name_kor=director['name_kor'],
                    profile_url=director['profile_url'])
                specific_movie.director.add(d)
    return movies_search
Code Example #29
File: parse.py Project: reidnilson/Python
from pyparsing import makeHTMLTags
import urllib

# read data from web page
url = "https://www.cia.gov/library/"\
        "publications/the-world-"\
        "factbook/docs/refmaps.html"
html = urllib.urlopen(url).read()

# define expression for <img> tag
imgTag,endImgTag = makeHTMLTags("img")

# search for matching tags, and
# print key attributes
for img in imgTag.searchString(html):
    print "'%(alt)s' : %(src)s" % img
Code Example #30
import yaml
import argparse
import jsonpickle
import json
import markdown
import re

from pygments import highlight
from pygments.lexers import guess_lexer
from pygments.formatters import HtmlFormatter

from pyparsing import makeHTMLTags, replaceWith, withAttribute

cod = "../frontend/node_modules/.bin/cod"

spanOpen, spanClose = makeHTMLTags("span")
emptySpans = spanOpen.copy().setParseAction(withAttribute(empty=True))
removeSpans = emptySpans | spanOpen + spanClose
removeSpans.setParseAction(replaceWith(" "))

extensions = ['.less', '.css', '.sass', '.scss']
markup_blocks = {}
formatter = HtmlFormatter(cssclass='source-highlight')


def highlight_source(source):
    if not source: return ''
    lexer = guess_lexer(source)
    return highlight(source, lexer, formatter)
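A small sketch of what the removeSpans transform defined above does (hypothetical input): makeHTMLTags exposes a boolean "empty" result for self-closed tags, so withAttribute(empty=True) selects <span/> tags, while spanOpen + spanClose catches <span ...></span> pairs with nothing in between; both collapse to a single space.

# illustrative input only
cleaned = removeSpans.transformString('a<span/>b<span class="x"></span>c')
# -> 'a b c'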

Code Example #31
# Copyright 2004-2010, by Paul McGuire
# September, 2010 - updated to more current use of setResultsName, new NIST URL
#
from pyparsing import (Word, Combine, SkipTo, nums, makeHTMLTags,
                       delimitedList, alphas, alphanums)
try:
    import urllib.request
    urlopen = urllib.request.urlopen
except ImportError:
    import urllib
    urlopen = urllib.urlopen

integer = Word(nums)
ipAddress = Combine(integer + "." + integer + "." + integer + "." + integer)
hostname = delimitedList(Word(alphas, alphanums + "-_"), ".", combine=True)
tdStart, tdEnd = makeHTMLTags("td")
timeServerPattern = (tdStart + hostname("hostname") + tdEnd + tdStart +
                     ipAddress("ipAddr") + tdEnd + tdStart +
                     SkipTo(tdEnd)("loc") + tdEnd)

# get list of time servers
nistTimeServerURL = "https://tf.nist.gov/tf-cgi/servers.cgi#"
serverListPage = urlopen(nistTimeServerURL)
serverListHTML = serverListPage.read().decode("UTF-8")
serverListPage.close()

addrs = {}
for srvr, startloc, endloc in timeServerPattern.scanString(serverListHTML):
    print("{} ({}) - {}".format(srvr.ipAddr, srvr.hostname.strip(),
                                srvr.loc.strip()))
    addrs[srvr.ipAddr] = srvr.loc
Code Example #32
import urllib.request, urllib.parse, urllib.error

from pyparsing import makeHTMLTags, SkipTo

# read HTML from a web page
serverListPage = urllib.request.urlopen( "http://www.yahoo.com" )
htmlText = serverListPage.read()
serverListPage.close()

# using makeHTMLTags to define opening and closing tags
anchorStart,anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens,start,end in anchor.scanString(htmlText):
    print(tokens.body,'->',tokens.href)
Code Example #33
from pyparsing import makeHTMLTags,withAttribute,Suppress,Regex,Group
import urllib

year = '2014'

conn = urllib.urlopen('http://www.boxofficemojo.com/yearly/chart/?yr=' + year + '&p=.htm')

""" looking for this recurring pattern:
          <td valign="top" tdalign="center">00-03</td>
          <td valign="top">.50</td>
          <td valign="top">.50</td>

    and want a dict with keys 0, 1, 2, and 3 all with values (.50,.50)
"""

td,tdend = makeHTMLTags("td")
keytd = td.copy().setParseAction(withAttribute(tdalign="center"))
td,tdend,keytd = map(Suppress,(td,tdend,keytd))

realnum = Regex(r'1?\.\d+').setParseAction(lambda t: float(t[0]))
integer = Regex(r'\d{1,3}').setParseAction(lambda t: int(t[0]))
DASH = Suppress('-')

# build up an expression matching the HTML bits above: a "key" cell holding a
# range like 00-03, followed by two value cells
entryExpr = (keytd + integer("start") + DASH + integer("end") + tdend +
             Group(2 * (td + realnum + tdend))("vals"))

# search the input HTML for matches to the entryExpr expression, and build up lookup dict
lookup = {}
for entry in entryExpr.searchString(conn.read()):
    for i in range(entry.start, entry.end + 1):
        lookup[i] = tuple(entry.vals)
Code Example #34
File: parser.py Project: zhouweitong3/linkding
    link = tag[0].link
    description = tag[0].description
    description = description[0] if description else ''

    return {
        'link': link,
        'description': description,
    }


def extract_description(tag):
    return tag[0].strip()


# define grammar
dt_start, _ = pp.makeHTMLTags("DT")
dd_start, _ = pp.makeHTMLTags("DD")
a_start, a_end = pp.makeHTMLTags("A")
bookmark_link_tag = pp.Group(a_start + a_start.tag_body("text") +
                             a_end.suppress())
bookmark_link_tag.addParseAction(extract_bookmark_link)
bookmark_description_tag = dd_start.suppress() + pp.SkipTo(
    pp.anyOpenTag | pp.anyCloseTag)("description")
bookmark_description_tag.addParseAction(extract_description)
bookmark_tag = pp.Group(dt_start + bookmark_link_tag("link") +
                        pp.ZeroOrMore(bookmark_description_tag)("description"))
bookmark_tag.addParseAction(extract_bookmark)


def parse(html: str) -> [NetscapeBookmark]:
    matches = bookmark_tag.searchString(html)
Code Example #35
#
# htmlTableParser.py
#
# Example of parsing a simple HTML table into a list of rows, and optionally into a little database
#
# Copyright 2019, Paul McGuire
#

import pyparsing as pp
import urllib.request

# define basic HTML tags, and compose into a Table
table, table_end = pp.makeHTMLTags("table")
thead, thead_end = pp.makeHTMLTags("thead")
tbody, tbody_end = pp.makeHTMLTags("tbody")
tr, tr_end = pp.makeHTMLTags("tr")
th, th_end = pp.makeHTMLTags("th")
td, td_end = pp.makeHTMLTags("td")
a, a_end = pp.makeHTMLTags("a")

# method to strip HTML tags from a string - will be used to clean up content of table cells
strip_html = (pp.anyOpenTag | pp.anyCloseTag).suppress().transformString

# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple
link = pp.Group(a + a.tag_body("text") + a_end.suppress())


def extract_text_and_url(t):
    return (t[0].text, t[0].href)
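The excerpt ends before wiring this helper in; presumably it is attached as a parse action, as the variant of this table parser shown below does inline:

link.addParseAction(extract_text_and_url)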

Code Example #36
#
# htmlTableParser.py
#
# Example of parsing a simple HTML table into a list of rows, and optionally into a little database
#
# Copyright 2019, Paul McGuire
#

import pyparsing as pp
import urllib.request


# define basic HTML tags, and compose into a Table
table, table_end = pp.makeHTMLTags('table')
thead, thead_end = pp.makeHTMLTags('thead')
tbody, tbody_end = pp.makeHTMLTags('tbody')
tr, tr_end = pp.makeHTMLTags('tr')
th, th_end = pp.makeHTMLTags('th')
td, td_end = pp.makeHTMLTags('td')
a, a_end = pp.makeHTMLTags('a')

# method to strip HTML tags from a string - will be used to clean up content of table cells
strip_html = (pp.anyOpenTag | pp.anyCloseTag).suppress().transformString

# expression for parsing <a href="url">text</a> links, returning a (text, url) tuple
link = pp.Group(a + pp.SkipTo(a_end)('text') + a_end.suppress())
link.addParseAction(lambda t: (t[0].text, t[0].href))

# method to create table rows of header and data tags
def table_row(start_tag, end_tag):
    body = pp.SkipTo(end_tag)
Code Example #37
# getNTPserversNew.py
#
# Demonstration of the parsing module, implementing a HTML page scanner,
# to extract a list of NTP time servers from the NIST web site.
#
# Copyright 2004-2010, by Paul McGuire
# September, 2010 - updated to more current use of setResultsName, new NIST URL
#
from pyparsing import (Word, Combine, Suppress, SkipTo, nums, makeHTMLTags,
                        delimitedList, alphas, alphanums)
import urllib

integer = Word(nums)
ipAddress = Combine( integer + "." + integer + "." + integer + "." + integer )
hostname = delimitedList(Word(alphas,alphanums+"-_"),".",combine=True)
tdStart,tdEnd = makeHTMLTags("td")
timeServerPattern =  (tdStart + hostname("hostname") + tdEnd + 
                      tdStart + ipAddress("ipAddr") + tdEnd + 
                      tdStart + SkipTo(tdEnd)("loc") + tdEnd)

# get list of time servers
nistTimeServerURL = "http://tf.nist.gov/tf-cgi/servers.cgi#"
serverListPage = urllib.urlopen( nistTimeServerURL )
serverListHTML = serverListPage.read()
serverListPage.close()

addrs = {}
for srvr,startloc,endloc in timeServerPattern.scanString( serverListHTML ):
    print "%s (%s) - %s" % (srvr.ipAddr, srvr.hostname.strip(), srvr.loc.strip())
    addrs[srvr.ipAddr] = srvr.loc
Code Example #38
def data_scrape(master_list_of_links):
    prefix = 'http://www.yelp.com'
    big_list = []
    for i in range(len(master_list_of_links)):
        time_between_big_links = randint(between_big_links_lower_bound, between_big_links_upper_bound)
        big_link = prefix + master_list_of_links[i]
        print big_link
        print "Scrape initiated"
        soup = link_opener(big_link)
        street = soup.find_all("span", itemprop="streetAddress")
        locality = soup.find_all("span", itemprop="addressLocality")
        state = soup.find_all("span", itemprop="addressRegion")
        zip_code = soup.find_all("span", itemprop="postalCode")
        phone = soup.find_all("span", class_="biz-phone")
        suffix = '?start='
        # review_count specifies how many search pages of reviews you will crawl through. This is set to go through at
        # most 320 reviews
        review_count = ['0', '40', '80', '120', '160', '200', '240', '280', '320']
        for j in review_count:
            time_between_review_pages = randint(between_review_pages_lower_bound, between_review_pages_upper_bound)
            print "processing..."
            new_link = big_link + suffix + j
            soup = link_opener(new_link)
            review_content = soup.find_all("div", class_="review-content")
            if not review_content:
                break
            meta_date = makeHTMLTags('meta')[0]
            meta_date.setParseAction(withAttribute(itemprop="datePublished"))
            meta_rating = makeHTMLTags('meta')[0]
            meta_rating.setParseAction(withAttribute(itemprop="ratingValue"))
            for k in review_content:
                indiv_list = [big_link]
                if not street:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(street[0].text)
                if not locality:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(locality[0].text)
                if not state:
                    indiv_list.append("DC")
                else:
                    indiv_list.append(state[0].text)
                if not zip_code:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(zip_code[0].text)
                if not phone:
                    indiv_list.append("Missing")
                else:
                    indiv_list.append(phone[0].text.strip())
                date = next(meta_date.scanString(k))[0]
                indiv_list.append(date.content)
                stars = next(meta_rating.scanString(k))[0]
                indiv_list.append(stars.content)
                indiv_list.append(k.p.text.encode("utf-8"))
                big_list.append(indiv_list)
            time.sleep(time_between_review_pages)
        print "Scrape complete!"
        time.sleep(time_between_big_links)
        print ""
    return big_list
Code Example #39
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsing import makeHTMLTags
from urllib.request import urlopen
import pprint

# Define the pyparsing grammar for a URL, that is:
#    URLlink ::= <a href= URL>linkText</a>
#    URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link.  Note also
# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
# pyparsing skips over whitespace between tokens.
linkOpenTag, linkCloseTag = makeHTMLTags("a")
link = linkOpenTag + linkOpenTag.tag_body("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.
with urlopen("https://www.cnn.com/") as serverListPage:
    htmlText = serverListPage.read()

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print(toks.startA.href, "->", toks.body)

# Create dictionary from list comprehension, assembled from each pair of tokens returned
# from a matched URL.
pprint.pprint(
    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
)
Code Example #40
File: urlExtractor.py Project: svn2github/pyparsing
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsing import makeHTMLTags, SkipTo, pyparsing_common
import urllib.request
from contextlib import closing
import pprint

linkOpenTag, linkCloseTag = makeHTMLTags('a')

linkBody = SkipTo(linkCloseTag)
linkBody.setParseAction(pyparsing_common.stripHTMLTags)
linkBody.addParseAction(lambda toks: ' '.join(toks[0].strip().split()))

link = linkOpenTag + linkBody("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.
with closing(urllib.request.urlopen("http://www.yahoo.com")) as serverListPage:
    htmlText = serverListPage.read().decode("UTF-8")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
for toks,strt,end in link.scanString(htmlText):
    print(toks.asList())

# Create dictionary from list comprehension, assembled from each pair of tokens returned 
# from a matched URL.
pprint.pprint( 
    dict((toks.body, toks.href) for toks,strt,end in link.scanString(htmlText))
    )