def main():
    base = sys.argv[1]
    fn = sys.argv[2]
    parser = PyHTMLParser(formatter.NullFormatter(), base, fn, indent=0)
    print '<?xml version="1.0" encoding="iso-8859-1"?>'
    print '<book title="Python %s Documentation" name="Python" version="%s" link="index.html">' % (
        sys.version[:3], sys.version[:3])
    print '<chapters>'
    parser.parse_file(fn)
    print '</chapters>'

    print '<functions>'
    fn = 'lib/genindex.html'
    parser = PyIdxHTMLParser(formatter.NullFormatter(), base, fn, indent=1)
    text = file(base + '/' + fn).read()
    parser.feed(text)
    parser.close()

    fn = 'api/genindex.html'
    parser = PyIdxHTMLParser(formatter.NullFormatter(), base, fn, indent=1)
    text = file(base + '/' + fn).read()
    parser.last_letter = 'letter-v'
    parser.feed(text)
    parser.close()
    print '</functions>'
    print '</book>'
def __init__(self, verbose=0):
    self.state = 0
    self.data = ''
    self.status = ''
    self.meaning = ''
    f = formatter.NullFormatter()
    htmllib.HTMLParser.__init__(self, f, verbose)
def handle_html(self):
    parser = titleparser(formatter.NullFormatter())
    parser.feed(self.response.body)
    if parser.bodytitle[1]:
        return parser.bodytitle[1]
    else:
        return parser.title
def __init__(self, url):
    """Read and parse a Web page found at the given URL.
    The URL must be a member of the URL class."""
    self.__url = url
    self.__anchorlist = []
    self.__text = ''
    if not self.__url.isCrawlable() or self.__url.isForbidden():
        raise Exception("Attempt to fetch non-crawlable page")
    html.parser.HTMLParser.__init__(self, formatter.NullFormatter())
    self.__rawContents = ''
    timeout = threading.Timer(self.MAX_URL_WAIT_TIME, self.__timeoutHandler)
    try:
        timeout.start()
        urlFile = urllib.request.urlopen(str(url))
        self.__rawContents = urlFile.read().decode()
    except (IOError, KeyboardInterrupt):
        pass
    try:
        urlFile.close()
    except (IOError, KeyboardInterrupt, UnboundLocalError):
        pass
    finally:
        timeout.cancel()
    #
    # Now it's time to parse the page contents
    #
    try:
        self.feed(self.__rawContents)
    except html.parser.HTMLParseError:
        pass  # We simply ignore HTML errors
    del self.__rawContents
def __init__(self, verbose=0):
    self.state = 0
    self.dict = {}
    self.key = ''
    self.value = ''
    f = formatter.NullFormatter()
    htmllib.HTMLParser.__init__(self, f, verbose)
def __init__(self, verbose=0):
    self.state = 0
    self.output = []
    self.weatherdata = {}
    self.last_key = ''
    f = formatter.NullFormatter()
    htmllib.HTMLParser.__init__(self, f, verbose)
def __init__(self):
    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
    self.in_span = False
    self.in_div = False
    self.no_user = False
    self.bad_pw = False
    self.already_exists = False
def test(args=None):
    import sys
    import formatter
    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        fn = args[0]
    else:
        fn = 'test.html'
    if fn == '-':
        data = sys.stdin.read()
    else:
        try:
            with open(fn, 'rt') as fh:
                data = fh.read()
        except IOError as msg:
            print(fn, ":", msg)
            sys.exit(1)
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
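For reference, a hedged invocation sketch: the argument list mirrors the command line `-s test.html`, and both the module name and the filename are placeholders rather than anything from the source.

# Hypothetical usage: run the harness silently over a local file,
# equivalent to invoking the module as: python <module>.py -s test.html
test(['-s', 'test.html'])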
def test(args=None):
    import sys, formatter
    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        file = args[0]
    else:
        file = 'test.html'
    if file == '-':
        f = sys.stdin
    else:
        try:
            f = open(file, 'r')
        except IOError as msg:
            print file, ':', msg
            sys.exit(1)
    data = f.read()
    if f is not sys.stdin:
        f.close()
    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
def OnGetItemText(self, item, col):
    f = self.model.config.files[item]
    if col == 0:
        return str(item)
    elif col == 1:
        return os.path.basename(f)
    elif col == 2:
        return f
    elif Preferences.hbShowDocumentTitles and col == 3:
        if not self.cached[item]:
            title = ''
            try:
                if os.path.splitext(f)[1].lower() not in ('.htm', '.html'):
                    return ''
                docsDir = os.path.dirname(self.model.filename)
                try:
                    data = Explorer.openEx(os.path.join(docsDir, f)).load()
                except ExplorerNodes.TransportError:
                    return ''
                fmtr = formatter.NullFormatter(formatter.NullWriter())
                try:
                    # the parser signals a found title by raising BreakOnTitle
                    HtmlDocDetailParser(fmtr, breakOnTitle=True).feed(data)
                except BreakOnTitle, title:
                    return str(title)
                except:
                    return ''
                else:
                    return ''
            finally:
                self.cached[item] = title
def get_links_from_file(filename):
    with open(filename) as inFile:
        content = inFile.read()
    htmlParser = LinksExtractor(formatter.NullFormatter())
    htmlParser.feed(content)
    for link in htmlParser.links:
        yield link
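A short usage sketch for the generator above; the filename is an assumption for illustration:

# Hypothetical driver: print every link extracted from a saved HTML page.
for link in get_links_from_file('saved_page.html'):
    print(link)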
def __init__(self):
    self.handle_a = True
    self.handle_img = True
    self.regexp = None
    self.base_href = None
    self.urls = []
    self.output_dir = "."
    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
def unescape(s):
    p = htmllib.HTMLParser(formatter.NullFormatter())
    # we need to preserve line breaks; nofill makes sure we don't
    # lose them
    p.nofill = True
    p.save_bgn()
    p.feed(s)
    return p.save_end().strip()
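A minimal round-trip sketch, assuming Python 2 where htmllib and formatter are still available; the input string is illustrative:

# Hypothetical example: entity references are decoded and surrounding
# whitespace is stripped; nofill keeps any inner line breaks intact.
print unescape('  Tom &amp; Jerry &lt;3  ')   # -> Tom & Jerry <3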
def extract(self, htmldata, docno, url):
    try:
        parser = htmllib.HTMLParser(formatter.NullFormatter())
        parser.feed(htmldata)
        return parser.anchorlist
    except Exception, ex:
        pass
    return []
def __init__(self):
    f = formatter.NullFormatter()
    #formatter.AbstractFormatter(formatter.DumbWriter())
    #htmllib.HTMLParser.__init__(self, f)
    sgmllib.SGMLParser.__init__(self, f)
    self.SqliteDB = SqliteDatabase(Globals.DBName)
    self.Stemmer = PorterStemmer()
    self.ReadStopWords('stopwords.txt')
    #self.textData = ""
    #self.BitMap = BitMap
    #self.WordFrequency = {}
    #self.splitter = re.compile(r'\W+', re.I)
    self.splitter = re.compile(r'\s+', re.I)
    #self.badWords = re.compile(r'.*\\*\/*_*\d+.*\\*\/*_*.*', re.I)
    self.DigitWord = re.compile(r'\b\d+\b', re.I)
    self.AlphaNumericWord = re.compile(r'\w+', re.I)
    #self.doubleSlashes = re.compile(r'\\*', re.I)
    self.tagType = ""
    self.REUTERSTOPICS = ""
    self.LEWISSPLIT = ""
    self.CGISPLIT = ""
    self.NEWID = ""
    self.DATE = ""
    self.MKNOTE = ""
    self.TOPICS = ""
    self.PLACES = ""
    self.UNKNOWN = ""
    self.AUTHOR = ""
    self.DATELINE = ""
    self.TITLE = ""
    self.PEOPLE = ""
    self.ORGS = ""
    self.EXCHANGES = ""
    self.COMPANIES = ""
    self.TEXTTYPE = ""
    self.DateHandled = False
    self.InTagDate = False
    self.MknoteHandled = False
    self.InTagMknote = False
    self.InTagTitle = False
    self.InTagDateline = False
    self.InTagBody = False
    self.InTagTopics = False
    self.InTagPlaces = False
    self.InTagPeople = False
    self.InTagOrgs = False
    self.InTagExchanges = False
    self.InTagCompanies = False
    self.InTagAuthor = False
    self.InTagUnknown = False
def parse_file(filename, bookname):
    fd = open(filename)
    try:
        p = HTMLParser(formatter.NullFormatter())
        p.feed(fd.read())
        p.close()
    except KeyboardInterrupt:
        raise SystemExit
    return p.a
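A hedged usage sketch: the filename and book name are placeholders, and `p.a` is assumed to be the anchor list accumulated by the surrounding HTMLParser subclass.

# Hypothetical call: collect the anchors from a local documentation page.
anchors = parse_file('lib/index.html', 'Python Library Reference')
for anchor in anchors:
    print(anchor)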
def __init__(self, prn=0):
    if prn:
        format = formatter.AbstractFormatter(formatter.DumbWriter())
    else:
        format = formatter.NullFormatter()
    htmllib.HTMLParser.__init__(self, format)
    self.depth = 0
    self.stack = []
def get_page_links(url, proxy_ip, proxy_port):
    format = formatter.NullFormatter()
    htmlparser = LinksExtractor(format)
    page = proxy_get_page(url, proxy_ip, proxy_port)
    htmlparser.feed(page)
    htmlparser.close()
    links = htmlparser.get_links()
    return links
def retrieveHTML(text):
    parser = lchtmllib.LCHTMLParser(formatter.NullFormatter())
    try:
        parser.feed(text)
        parser.close()
    except SGMLParseError, e:
        # SGMLLib seems to die on bad HTML sometimes. (At least with python2.1)
        log.logger.error('retrieveHTML failed due to SGMLParseError: %s' % str(e))
def __init__(self, verbose=0):
    htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)
    # A list of (title, daysUntilDue) tuples.
    self.booksDue = []
    self.inTitle = False
    self.title = None
    self.inStatus = False
def __init__(self, writer, dirn):
    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
    self.writer = writer
    self.dirn = dirn
    self.entry = ""
    self.desc = ""
    self.do_entry = False
    self.one_entry = False
    self.num_of_a = 0
def urlparser2(url):
    import urllib.request, urllib.parse, formatter
    from html.parser import HTMLParser
    response = urllib.request.urlopen(url)
    data = response.read().decode()
    response.close()
    format = formatter.AbstractFormatter(formatter.NullFormatter())
    ptext = HTMLParser(format)
    ptext.feed(data)
    for link in ptext.anchorlist:
        print(link)
def main(self, id="785646"):
    file = urllib.urlopen(
        "http://www.poker4ever.com/en.tournaments.tournament-statistics?tid=" + id)
    self.parser = SummaryParser(formatter.NullFormatter())
    self.parser.feed(file.read())
    print "site=", self.parser.SiteName, "tourneyname=", self.parser.TourneyName, "tourneyid=", self.parser.TourneyId
    print "start time=", self.parser.TourneyStartTime, "end time=", self.parser.TourneyEndTime
    print "structure=", self.parser.TourneyStructure, "game type=", self.parser.TourneyGameType
    print "buy-in=", self.parser.TourneyBuyIn, "rebuys=", self.parser.TourneyRebuys, "total players=", self.parser.TourneysPlayers, "pool=", self.parser.TourneyPool
    print "results=", self.parser.Results
def __init__(self, url, web_user=None, web_password=None, verbose=0):
    f = formatter.NullFormatter()
    self.url = url
    self.user = web_user
    self.password = web_password
    self.links = []
    self.mboxes_links = []
    htmllib.HTMLParser.__init__(self, f, verbose)
def __init__(self, form_id, verbose=False):
    f = formatter.NullFormatter()
    htmllib.HTMLParser.__init__(self, f, verbose)
    self.form_id = form_id
    self.getform = False
    self.select_name = None
    self.option_value = None
    self.first_option_value = None
    # self.formdata = {}
    self.form = BasicFormData()
    self.formdata = self.form.form_data
def __init__(self, baseUrl, score):
    """
    :param baseUrl: The base URL to use for all relative URLs contained within a document.
    :param score: The score of each page, i.e., the number of keywords in that page.
    """
    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
    self.score = score
    self.baseUrl = baseUrl
    # Usage note: if multiple <base> elements are specified, only the first
    # href and first target value are used; all others are ignored.
    self.parsedBaseElement = False
def build_keywords():
    data = read_segment(
        os.path.join(api_path, 'indices.html'),
        '<!-- =========== START OF IDENTIFIER INDEX =========== -->',
        '<!-- =========== START OF NAVBAR =========== -->')
    p = APIIndicesParser(formatter.NullFormatter(formatter.NullWriter()))
    p.feed(data)
    hhk = header_hhx + '<UL>' + os.linesep + \
          ''.join([entry_hhx % (u, k) for u, k in p.indices]) + os.linesep + '</UL>'
    open(os.path.join(api_path, api_name + '.hhk'), 'w').write(hhk)
def testLinkExtracting(repositoryListingPage):
    defaultFormatter = formatter.NullFormatter()
    extractor = LinksExtractor(defaultFormatter)
    extractor.feed(repositoryListingPage)
    extractor.close()
    for link in extractor.getLinks():
        # Currently just checking their existence
        break
    else:
        raise RuntimeError("No links found!")
def parse_file(self, href):
    # TODO: determine the basedir
    parent = os.path.join(self.basedir, self.fn)
    self.parents.add(parent)
    parser = PyHTMLParser(formatter.NullFormatter(), self.basedir, href,
                          self.indent + 1, self.parents)
    text = file(self.basedir + '/' + href).read()
    parser.feed(text)
    parser.finish()
    parser.close()
    if parent in self.parents:
        self.parents.remove(parent)
def __init__(self, depth, base_url):
    """Subclass (inherit) from HTMLParser.

    :param depth: The depth of each page, i.e., its minimum distance from one of the 10 start pages.
    :param base_url: The base URL to use for all relative URLs contained within a document.
    """
    htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
    self.depth = depth
    self.base_url = base_url
    # Usage note: if multiple <base> elements are specified, only the first
    # href and first target value are used; all others are ignored.
    self.has_parsed_base_element = False
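To show how these htmllib subclasses are typically driven, a minimal sketch under stated assumptions: the class name CrawlParser and the URL are illustrative, not from the source (Python 2).

# Hypothetical driver for a crawler parser like the ones above.
import urllib
page = urllib.urlopen('http://example.com/').read()
parser = CrawlParser(depth=0, base_url='http://example.com/')  # assumed class name
parser.feed(page)
parser.close()
# parser now holds whatever state its tag handlers collected while feeding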