# getNTPserversNew.py
#
# Demonstration of the parsing module, implementing a HTML page scanner,
# to extract a list of NTP time servers from the NIST web site.
#
# Copyright 2004-2010, by Paul McGuire
# September, 2010 - updated to more current use of setResultsName, new NIST URL
#
from pyparsingOD import Word, Combine, Suppress, SkipTo, nums, makeHTMLTags, delimitedList, alphas, alphanums
import urllib.request, urllib.parse, urllib.error

# Grammar pieces for one row of the NIST time-server table.
integer = Word(nums)
# dotted-quad address; Combine joins the four numbers and dots into one token
ipAddress = Combine(integer + "." + integer + "." + integer + "." + integer)
# dot-separated host name labels, re-joined into one string (combine=True)
hostname = delimitedList(Word(alphas, alphanums + "-_"), ".", combine=True)
tdStart, tdEnd = makeHTMLTags("td")
# three consecutive <td> cells: host name, IP address, then free-form text
# captured up to the closing tag and exposed as "loc"
timeServerPattern = (
    tdStart
    + hostname("hostname")
    + tdEnd
    + tdStart
    + ipAddress("ipAddr")
    + tdEnd
    + tdStart
    + SkipTo(tdEnd)("loc")
    + tdEnd
)

# get list of time servers
nistTimeServerURL = "http://tf.nist.gov/tf-cgi/servers.cgi#"
# The with-block closes the response even if read() or decode() raises.
with urllib.request.urlopen(nistTimeServerURL) as serverListPage:
    # read() returns bytes; decode so the grammar is applied to text.
    # Prefer the charset declared by the server, falling back to UTF-8.
    pageCharset = serverListPage.headers.get_content_charset() or "utf-8"
    serverListHTML = serverListPage.read().decode(pageCharset, errors="replace")
from pyparsingOD import makeHTMLTags, SkipTo, htmlComment
import urllib.request, urllib.parse, urllib.error

# Fetch the page to scan. The with-block closes the response even if
# read() or decode() raises.
with urllib.request.urlopen("http://www.yahoo.com") as serverListPage:
    # read() returns bytes; decode so the grammar is applied to text.
    # Prefer the charset declared by the server, falling back to UTF-8.
    pageCharset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(pageCharset, errors="replace")

# expressions for the opening and closing anchor tags
aStart, aEnd = makeHTMLTags("A")

# an anchor: opening tag, everything up to the closing tag (named "link"),
# then the closing tag itself
link = aStart + SkipTo(aEnd).setResultsName("link") + aEnd
# have the expression skip over HTML comments while matching
link.ignore(htmlComment)

# print each anchor's body text together with the href of its opening tag
for toks, start, end in link.scanString(htmlText):
    print(toks.link, "->", toks.startA.href)
import urllib.request, urllib.parse, urllib.error

from pyparsingOD import makeHTMLTags, SkipTo

# read HTML from a web page; the with-block closes the response even if
# read() or decode() raises
with urllib.request.urlopen("http://www.yahoo.com") as serverListPage:
    # read() returns bytes; decode so the grammar is applied to text.
    # Prefer the charset declared by the server, falling back to UTF-8.
    pageCharset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(pageCharset, errors="replace")

# using makeHTMLTags to define opening and closing tags
anchorStart, anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference; the text between the
# tags is captured under the results name "body"
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens, start, end in anchor.scanString(htmlText):
    print(tokens.body, '->', tokens.href)
# Example #4
# 0
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsingOD import Literal,Suppress,CharsNotIn,CaselessLiteral,\
        Word,dblQuotedString,alphanums,SkipTo,makeHTMLTags
import urllib.request, urllib.parse, urllib.error
import pprint

# Define the pyparsing grammar for a URL, that is:
#    URLlink ::= <a href= URL>linkText</a>
#    URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link.  Note also
# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
# pyparsing skips over whitespace between tokens.
linkOpenTag, linkCloseTag = makeHTMLTags("a")
# The text between the tags is captured as "body"; the closing tag is
# matched but suppressed from the results.
link = linkOpenTag + SkipTo(linkCloseTag).setResultsName(
    "body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.  The with-block closes the
# response even if read() or decode() raises.
with urllib.request.urlopen("http://www.google.com") as serverListPage:
    # read() returns bytes; decode so the grammar is applied to text.
    # Prefer the charset declared by the server, falling back to UTF-8.
    pageCharset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(pageCharset, errors="replace")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print(toks.startA.href, "->", toks.body)

# Create dictionary from a comprehension, assembled from each pair of tokens
# returned from a matched URL (link body -> href).  A later link with the
# same body text overwrites an earlier one.
pprint.pprint(
    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
)
# URL extractor
# Copyright 2004, Paul McGuire
from pyparsingOD import Literal,Suppress,CharsNotIn,CaselessLiteral,\
        Word,dblQuotedString,alphanums,SkipTo,makeHTMLTags
import urllib.request, urllib.parse, urllib.error
import pprint

# Define the pyparsing grammar for a URL, that is:
#    URLlink ::= <a href= URL>linkText</a>
#    URL ::= doubleQuotedString | alphanumericWordPath
# Note that whitespace may appear just about anywhere in the link.  Note also
# that it is not necessary to explicitly show this in the pyparsing grammar; by default,
# pyparsing skips over whitespace between tokens.
linkOpenTag, linkCloseTag = makeHTMLTags("a")
# The text between the tags is captured as "body"; the closing tag is
# matched but suppressed from the results.
link = linkOpenTag + SkipTo(linkCloseTag).setResultsName("body") + linkCloseTag.suppress()

# Go get some HTML with some links in it.  The with-block closes the
# response even if read() or decode() raises.
with urllib.request.urlopen("http://www.google.com") as serverListPage:
    # read() returns bytes; decode so the grammar is applied to text.
    # Prefer the charset declared by the server, falling back to UTF-8.
    pageCharset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(pageCharset, errors="replace")

# scanString is a generator that loops through the input htmlText, and for each
# match yields the tokens and start and end locations (for this application, we are
# not interested in the start and end values).
for toks, strt, end in link.scanString(htmlText):
    print(toks.startA.href, "->", toks.body)

# Create dictionary from a comprehension, assembled from each pair of tokens
# returned from a matched URL (link body -> href).  A later link with the
# same body text overwrites an earlier one.
pprint.pprint(
    {toks.body: toks.startA.href for toks, strt, end in link.scanString(htmlText)}
)
import urllib.request, urllib.parse, urllib.error

from pyparsingOD import makeHTMLTags, SkipTo

# read HTML from a web page; the with-block closes the response even if
# read() or decode() raises
with urllib.request.urlopen("http://www.yahoo.com") as serverListPage:
    # read() returns bytes; decode so the grammar is applied to text.
    # Prefer the charset declared by the server, falling back to UTF-8.
    pageCharset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(pageCharset, errors="replace")

# using makeHTMLTags to define opening and closing tags
anchorStart, anchorEnd = makeHTMLTags("a")

# compose an expression for an anchored reference; the text between the
# tags is captured under the results name "body"
anchor = anchorStart + SkipTo(anchorEnd)("body") + anchorEnd

# use scanString to scan through the HTML source, extracting
# just the anchor tags and their associated body text
# (note the href attribute of the opening A tag is available
# as an attribute in the returned parse results)
for tokens, start, end in anchor.scanString(htmlText):
    print(tokens.body, '->', tokens.href)
# Example #7
# 0
from pyparsingOD import makeHTMLTags, SkipTo, htmlComment
import urllib.request, urllib.parse, urllib.error

# Fetch the page to scan. The with-block closes the response even if
# read() or decode() raises.
with urllib.request.urlopen("http://www.yahoo.com") as serverListPage:
    # read() returns bytes; decode so the grammar is applied to text.
    # Prefer the charset declared by the server, falling back to UTF-8.
    pageCharset = serverListPage.headers.get_content_charset() or "utf-8"
    htmlText = serverListPage.read().decode(pageCharset, errors="replace")

# expressions for the opening and closing anchor tags
aStart, aEnd = makeHTMLTags("A")

# an anchor: opening tag, everything up to the closing tag (named "link"),
# then the closing tag itself
link = aStart + SkipTo(aEnd).setResultsName("link") + aEnd
# have the expression skip over HTML comments while matching
link.ignore(htmlComment)

# print each anchor's body text together with the href of its opening tag
for toks, start, end in link.scanString(htmlText):
    print(toks.link, "->", toks.startA.href)
# getNTPserversNew.py
#
# Demonstration of the parsing module, implementing a HTML page scanner,
# to extract a list of NTP time servers from the NIST web site.
#
# Copyright 2004-2010, by Paul McGuire
# September, 2010 - updated to more current use of setResultsName, new NIST URL
#
from pyparsingOD import (Word, Combine, Suppress, SkipTo, nums, makeHTMLTags,
                        delimitedList, alphas, alphanums)
import urllib.request, urllib.parse, urllib.error

# Grammar pieces for one row of the NIST time-server table.
integer = Word(nums)
# dotted-quad address; Combine joins the four numbers and dots into one token
ipAddress = Combine(integer + "." + integer + "." + integer + "." + integer)
# dot-separated host name labels, re-joined into one string (combine=True)
hostname = delimitedList(Word(alphas, alphanums + "-_"), ".", combine=True)
tdStart, tdEnd = makeHTMLTags("td")
# three consecutive <td> cells: host name, IP address, then free-form text
# captured up to the closing tag and exposed as "loc"
timeServerPattern = (
    tdStart + hostname("hostname") + tdEnd
    + tdStart + ipAddress("ipAddr") + tdEnd
    + tdStart + SkipTo(tdEnd)("loc") + tdEnd
)

# get list of time servers
nistTimeServerURL = "http://tf.nist.gov/tf-cgi/servers.cgi#"
# The with-block closes the response even if read() or decode() raises.
with urllib.request.urlopen(nistTimeServerURL) as serverListPage:
    # read() returns bytes; decode so the grammar is applied to text.
    # Prefer the charset declared by the server, falling back to UTF-8.
    pageCharset = serverListPage.headers.get_content_charset() or "utf-8"
    serverListHTML = serverListPage.read().decode(pageCharset, errors="replace")

# Map each matched IP address to its location text, printing a summary line
# per server as it is found.  Note the stored location is not stripped,
# unlike the printed one.
addrs = {}
for srvr, startloc, endloc in timeServerPattern.scanString(serverListHTML):
    print("%s (%s) - %s" % (srvr.ipAddr, srvr.hostname.strip(), srvr.loc.strip()))
    addrs[srvr.ipAddr] = srvr.loc