def main():
    """Emit a DevHelp-style XML book index for the Python documentation.

    Usage: script BASE INDEXFILE -- BASE is the documentation root
    directory, INDEXFILE the HTML file parsed for the chapter tree.
    The XML document is written to stdout.

    Fixed: Python 2 `print` statements (SyntaxError under Python 3) and
    the Python 2 `file()` builtin (removed in Python 3); file handles are
    now closed via `with`.
    """
    base = sys.argv[1]
    fn = sys.argv[2]

    # Chapter tree, parsed recursively starting from the given index file.
    parser = PyHTMLParser(formatter.NullFormatter(), base, fn, indent=0)
    print('<?xml version="1.0" encoding="iso-8859-1"?>')
    print('<book title="Python %s Documentation" name="Python" version="%s" link="index.html">' % (
        sys.version[:3], sys.version[:3]))
    print('<chapters>')
    parser.parse_file(fn)
    print('</chapters>')

    print('<functions>')

    # Function index: library reference first...
    fn = 'lib/genindex.html'
    parser = PyIdxHTMLParser(formatter.NullFormatter(), base, fn, indent=1)
    with open(base + '/' + fn) as fh:
        text = fh.read()
    parser.feed(text)
    parser.close()

    # ...then the C API reference.
    fn = 'api/genindex.html'
    parser = PyIdxHTMLParser(formatter.NullFormatter(), base, fn, indent=1)
    with open(base + '/' + fn) as fh:
        text = fh.read()
    # Presumably makes the parser resume after the 'v' letter section --
    # TODO confirm against PyIdxHTMLParser.
    parser.last_letter = 'letter-v'
    parser.feed(text)
    parser.close()

    print('</functions>')
    print('</book>')
Example #2
0
 def __init__(self, verbose=0):
     """Reset the scraping state and hook up a no-op formatter.

     :param verbose: forwarded to htmllib.HTMLParser.
     """
     self.state, self.data = 0, ''
     self.status, self.meaning = '', ''
     htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)
Example #3
0
 def handle_html(self):
     """Return a title for the response body, preferring the body title."""
     tp = titleparser(formatter.NullFormatter())
     tp.feed(self.response.body)
     # Fall back to the plain title when no body title was found.
     return tp.bodytitle[1] or tp.title
Example #4
0
 def __init__(self, url):
     """Read and parse a Web page found at the given URL.  The URL must
        be a member of the URL class."""
     self.__url = url
     self.__anchorlist = []  # anchors collected while parsing
     self.__text = ''        # accumulated page text
     # Refuse pages the URL class marks as non-crawlable or forbidden.
     if not self.__url.isCrawlable() or  self.__url.isForbidden():
         raise Exception("Attempt to fetch non-crawlable page")
     html.parser.HTMLParser.__init__(self, formatter.NullFormatter())
     self.__rawContents = ''
     # Watchdog: __timeoutHandler fires if the fetch takes longer than
     # MAX_URL_WAIT_TIME -- presumably it interrupts the read; TODO confirm.
     timeout = \
       threading.Timer(self.MAX_URL_WAIT_TIME, self.__timeoutHandler)
     try:
         timeout.start()
         urlFile = urllib.request.urlopen(str(url))
         self.__rawContents = urlFile.read().decode()
     except (IOError, KeyboardInterrupt):
         pass  # fetch failure: leave rawContents empty
     try:
         urlFile.close()
     except (IOError, KeyboardInterrupt, UnboundLocalError):
         pass  # urlFile is unbound when urlopen itself failed above
     finally:
         timeout.cancel()
     #
     # Now it's time to parse the page contents
     #
     # NOTE(review): html.parser.HTMLParseError was removed in Python 3.5;
     # on modern Pythons this except clause raises AttributeError instead
     # of catching anything -- confirm the supported Python version.
     try:
         self.feed(self.__rawContents)
     except html.parser.HTMLParseError:
         pass                # We simply ignore HTML errors
     del self.__rawContents
Example #5
0
 def __init__(self, verbose=0):
     """Initialize key/value scraping state; output is discarded.

     :param verbose: forwarded to htmllib.HTMLParser.
     """
     self.state = 0
     self.dict, self.key, self.value = {}, '', ''
     htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)
Example #6
0
 def __init__(self, verbose=0):
     """Start the weather-page scraper with no data collected yet."""
     null_formatter = formatter.NullFormatter()
     htmllib.HTMLParser.__init__(self, null_formatter, verbose)
     self.state = 0
     self.output = []
     self.weatherdata = {}
     self.last_key = ''
Example #7
0
 def __init__(self):
     """Initialize parsing flags; rendering is suppressed via NullFormatter."""
     htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
     # Flags toggled by the tag handlers while scanning the page.
     self.in_span = self.in_div = False
     self.no_user = self.bad_pw = self.already_exists = False
Example #8
0
def test(args=None):
    """Parse an HTML file and render (or silently swallow) its output.

    args defaults to sys.argv[1:].  A leading '-s' selects silent mode
    (NullFormatter); the next argument names the input file -- default
    'test.html', or '-' to read stdin.  Exits with status 1 on I/O error.
    """
    import sys
    import formatter

    args = args or sys.argv[1:]

    silent = args[:1] == ['-s']
    if silent:
        del args[0]

    fn = args[0] if args else 'test.html'

    if fn == '-':
        data = sys.stdin.read()
    else:
        try:
            with open(fn, 'rt') as fh:
                data = fh.read()
        except IOError as msg:
            print(fn, ":", msg)
            sys.exit(1)

    chosen = (formatter.NullFormatter() if silent
              else formatter.AbstractFormatter(formatter.DumbWriter()))

    p = HTMLParser(chosen)
    p.feed(data)
    p.close()
Example #9
0
def test(args=None):
    """Parse an HTML file through the library's HTMLParser.

    args defaults to sys.argv[1:].  A leading '-s' selects silent mode
    (NullFormatter); the next argument names the input file -- default
    'test.html', or '-' to read stdin.  Exits with status 1 on I/O error.

    Fixed: the Python 2 `print file, ':', msg` statement (SyntaxError
    under Python 3); stopped shadowing the builtin name `file`; the input
    file is now closed deterministically via `with`.
    """
    import sys
    import formatter

    if not args:
        args = sys.argv[1:]
    silent = args and args[0] == '-s'
    if silent:
        del args[0]
    if args:
        fn = args[0]
    else:
        fn = 'test.html'

    if fn == '-':
        data = sys.stdin.read()
    else:
        try:
            with open(fn, 'r') as fh:
                data = fh.read()
        except IOError as msg:
            print(fn, ':', msg)
            sys.exit(1)

    if silent:
        f = formatter.NullFormatter()
    else:
        f = formatter.AbstractFormatter(formatter.DumbWriter())
    p = HTMLParser(f)
    p.feed(data)
    p.close()
Example #10
0
 def OnGetItemText(self, item, col):
     """Return the display text for list cell (item, col).

     Columns: 0 = item index, 1 = file basename, 2 = full path,
     3 = HTML document title (only when Preferences.hbShowDocumentTitles),
     cached in self.cached per item.

     Fixed: Python 2 backtick repr (`` ` item ` ``) and the Python 2
     `except BreakOnTitle, title` clause, both SyntaxErrors under
     Python 3; the bare `except:` narrowed to `except Exception:`.  The
     caught exception is bound to a fresh name because Python 3 deletes
     the `as` target when the handler exits, which would break the
     `finally` block that caches `title`.
     """
     f = self.model.config.files[item]
     if col == 0:
         return repr(item)
     elif col == 1:
         return os.path.basename(f)
     elif col == 2:
         return f
     elif Preferences.hbShowDocumentTitles and col == 3:
         if not self.cached[item]:
             title = ''
             try:
                 # Only HTML files can yield a document title.
                 if os.path.splitext(f)[1].lower() not in ('.htm', '.html'):
                     return ''
                 docsDir = os.path.dirname(self.model.filename)
                 try:
                     data = Explorer.openEx(os.path.join(docsDir, f)).load()
                 except ExplorerNodes.TransportError:
                     return ''
                 fmtr = formatter.NullFormatter(formatter.NullWriter())
                 try:
                     # The parser raises BreakOnTitle (carrying the title)
                     # as soon as it has one -- presumably on </title>;
                     # TODO confirm against HtmlDocDetailParser.
                     HtmlDocDetailParser(fmtr, breakOnTitle=True).feed(data)
                 except BreakOnTitle as exc:
                     title = str(exc)
                     return title
                 except Exception:
                     return ''
                 else:
                     return ''
             finally:
                 # Cache whatever was found (possibly '') for next time.
                 self.cached[item] = title
Example #11
0
def get_links_from_file(filename):
    """Yield every link extracted from the given HTML file."""
    with open(filename) as source:
        markup = source.read()

    extractor = LinksExtractor(formatter.NullFormatter())
    extractor.feed(markup)
    yield from extractor.links
Example #12
0
 def __init__(self):
     """Configure the URL harvester with its default settings."""
     htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
     # Collect targets from both <a> and <img> tags by default.
     self.handle_a = self.handle_img = True
     self.regexp = None       # optional URL filter pattern
     self.base_href = None
     self.urls = []
     self.output_dir = "."
Example #13
0
def unescape(s):
    """Decode HTML entities in *s* via htmllib, preserving line breaks."""
    parser = htmllib.HTMLParser(formatter.NullFormatter())
    # nofill stops the parser from collapsing whitespace, so the
    # newlines in the input survive the round trip.
    parser.nofill = True
    parser.save_bgn()
    parser.feed(s)
    return parser.save_end().strip()
Example #14
0
 def extract(self, htmldata, docno, url):
     """Return the list of anchor URLs found in *htmldata*.

     docno and url are accepted for interface compatibility but are not
     used here.  Returns [] when parsing fails for any reason.

     Fixed: `except Exception,ex` is a SyntaxError under Python 3; the
     dead `pass` before the return and the unused exception binding have
     been dropped.
     """
     try:
         parser = htmllib.HTMLParser(formatter.NullFormatter())
         parser.feed(htmldata)
         return parser.anchorlist
     except Exception:
         return []
Example #15
0
    def __init__(self):
        """Initialize the Reuters-corpus SGML parser.

        Sets up the database handle, stemmer, stopword list, tokenizing
        regexes, one accumulator per document field, and the in-tag state
        flags flipped by the tag handlers.

        Fixed: self.TOPICS and self.PLACES were each initialized twice;
        the duplicate assignments and dead commented-out code removed.
        """
        f = formatter.NullFormatter()
        sgmllib.SGMLParser.__init__(self, f)
        self.SqliteDB = SqliteDatabase(Globals.DBName)

        self.Stemmer = PorterStemmer()

        self.ReadStopWords('stopwords.txt')

        # Tokenization helpers.
        self.splitter = re.compile(r'\s+', re.I)
        self.DigitWord = re.compile(r'\b\d+\b', re.I)
        self.AlphaNumericWord = re.compile(r'\w+', re.I)

        # One text accumulator per Reuters document field.
        self.tagType = ""
        self.REUTERSTOPICS = ""
        self.LEWISSPLIT = ""
        self.CGISPLIT = ""
        self.NEWID = ""
        self.DATE = ""
        self.MKNOTE = ""
        self.TOPICS = ""
        self.PLACES = ""
        self.UNKNOWN = ""
        self.AUTHOR = ""
        self.DATELINE = ""
        self.TITLE = ""
        self.PEOPLE = ""
        self.ORGS = ""
        self.EXCHANGES = ""
        self.COMPANIES = ""
        self.TEXTTYPE = ""

        self.DateHandled = False
        self.InTagDate = False
        self.MknoteHandled = False

        # Which element we are currently inside of.
        self.InTagMknote = False
        self.InTagTitle = False
        self.InTagDateline = False
        self.InTagBody = False
        self.InTagTopics = False
        self.InTagPlaces = False
        self.InTagPeople = False
        self.InTagOrgs = False
        self.InTagExchanges = False
        self.InTagCompanies = False
        self.InTagAuthor = False
        self.InTagUnknown = False
Example #16
0
def parse_file(filename, bookname):
    """Parse *filename* with HTMLParser and return its collected data (p.a).

    bookname is accepted for interface compatibility but unused here.
    KeyboardInterrupt is converted to SystemExit, aborting the program.

    Fixed: the input file handle was never closed; it is now managed by
    a `with` block.
    """
    with open(filename) as fd:
        try:
            p = HTMLParser(formatter.NullFormatter())
            p.feed(fd.read())
            p.close()
        except KeyboardInterrupt:
            raise SystemExit
    return p.a
Example #17
0
    def __init__(self, prn=0):
        """Initialize the parser with empty traversal state.

        :param prn: when true, render output through a dumb-terminal
            formatter; otherwise discard it with a NullFormatter.
        """
        chosen = (formatter.AbstractFormatter(formatter.DumbWriter())
                  if prn else formatter.NullFormatter())
        htmllib.HTMLParser.__init__(self, chosen)
        self.depth = 0
        self.stack = []
Example #18
0
def get_page_links(url, proxy_ip, proxy_port):
    """Fetch *url* through the given proxy and return the links found on it.

    :param url: page to fetch.
    :param proxy_ip: proxy host passed to proxy_get_page.
    :param proxy_port: proxy port passed to proxy_get_page.
    :return: whatever LinksExtractor.get_links() yields for the page.

    Fixed: the link list was computed but never returned, so the function
    always handed None back to its caller.
    """
    htmlparser = LinksExtractor(formatter.NullFormatter())
    page = proxy_get_page(url, proxy_ip, proxy_port)

    htmlparser.feed(page)
    htmlparser.close()

    return htmlparser.get_links()
Example #19
0
def retrieveHTML(text):
    """Feed *text* through LCHTMLParser, logging (not raising) parse errors.

    Fixed: `except SGMLParseError, e` is Python 2 syntax and a
    SyntaxError under Python 3; replaced with `as e`.
    """
    parser = lchtmllib.LCHTMLParser(formatter.NullFormatter())
    try:
        parser.feed(text)
        parser.close()
    except SGMLParseError as e:
        # SGMLLib seems to die on bad HTML sometimes. (At least with python2.1)
        log.logger.error('retrieveHTML failed due to SGMLParseError: %s' %
                         str(e))
Example #20
0
    def __init__(self, verbose=0):
        """Prepare to scrape the 'books due' page; output is discarded."""
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)

        # A list of (title, daysUntilDue) tuples.
        self.booksDue = []

        # Tag-tracking state flipped by the start/end handlers.
        self.inTitle = self.inStatus = False
        self.title = None
Example #21
0
    def __init__(self, writer, dirn):
        """Remember the output writer and directory; reset entry state."""
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())

        self.writer, self.dirn = writer, dirn
        self.entry = self.desc = ""
        self.do_entry = self.one_entry = False
        self.num_of_a = 0   # count of <a> tags seen so far
def urlparser2():
    """Fetch `url` and print the anchor targets found on the page.

    NOTE(review): several problems to confirm before using --
      * `url` is read from global scope and is not defined here; this
        raises NameError unless the caller provides a global `url`.
      * `data` is bytes (urlopen response, never decoded), but
        html.parser's feed() expects str in Python 3.
      * html.parser.HTMLParser takes no formatter argument and has no
        `anchorlist` attribute; this looks like an incomplete port of
        Python 2 htmllib code.
      * AbstractFormatter is given a formatter where it expects a writer.
    """
    import urllib.request, urllib.parse, formatter
    from html.parser import HTMLParser
    response = urllib.request.urlopen(url)
    data = response.read()
    response.close()
    format = formatter.AbstractFormatter(formatter.NullFormatter())
    ptext = HTMLParser(format)
    ptext.feed(data)
    for link in ptext.anchorlist:
        print(link)
Example #23
0
 def main(self, id="785646"):
     """Fetch a tournament-statistics page, parse it, and print a summary.

     :param id: tournament id appended to the stats URL.

     Fixed: Python 2 `print` statements (SyntaxErrors under Python 3) and
     the removed `urllib.urlopen` (now `urllib.request.urlopen`); the
     local no longer shadows the builtin `file`.
     """
     import urllib.request
     page = urllib.request.urlopen(
         "http://www.poker4ever.com/en.tournaments.tournament-statistics?tid="
         + id)
     self.parser = SummaryParser(formatter.NullFormatter())
     self.parser.feed(page.read())
     print("site=", self.parser.SiteName, "tourneyname=", self.parser.TourneyName, "tourneyid=", self.parser.TourneyId)
     print("start time=", self.parser.TourneyStartTime, "end time=", self.parser.TourneyEndTime)
     print("structure=", self.parser.TourneyStructure, "game type=", self.parser.TourneyGameType)
     print("buy-in=", self.parser.TourneyBuyIn, "rebuys=", self.parser.TourneyRebuys, "total players=", self.parser.TourneysPlayers, "pool=", self.parser.TourneyPool)
     print("results=", self.parser.Results)
    def __init__(self, url, web_user=None, web_password=None, verbose=0):
        """Record the archive location and credentials, with empty link lists.

        :param url: archive URL to scrape.
        :param web_user: optional HTTP user name.
        :param web_password: optional HTTP password.
        :param verbose: forwarded to htmllib.HTMLParser.
        """
        self.url = url
        self.user = web_user
        self.password = web_password
        self.links = []
        self.mboxes_links = []

        htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)
Example #25
0
 def __init__(self, form_id, verbose=False):
     """Set up scraping state for the form with the given id.

     :param form_id: id attribute of the <form> element to scrape.
     :param verbose: forwarded to htmllib.HTMLParser.

     Fixed: the NullFormatter `f` was created but never passed to
     htmllib.HTMLParser.__init__, which requires a formatter argument
     (the original call raised TypeError); `verbose` is now forwarded.
     """
     f = formatter.NullFormatter()
     htmllib.HTMLParser.__init__(self, f, verbose)
     self.form_id = form_id
     self.getform = False    # presumably True while inside the target form -- confirm in handlers
     self.select_name = None
     self.option_value = None
     self.first_option_value = None
     self.form = BasicFormData()
     self.formdata = self.form.form_data   # alias into the form's own data
Example #26
0
 def __init__(self, baseUrl, score):
     """
     :param score: The score of each page, i.e., the number of keywords in that page.
     :param base_url: The base URL to use for all relative URLs contained within a document.
     """
     htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
     self.score, self.baseUrl = score, baseUrl
     # Only the first <base> element's href/target values are honored;
     # this flag records whether one has been seen already.
     self.parsedBaseElement = False
Example #27
0
def build_keywords():
    """Build the .hhk keyword-index file from the API indices.html page.

    Reads the identifier-index segment of indices.html, parses it for
    (url, keyword) pairs, and writes them as an <UL> of hhk entries.

    Fixed: the output file handle returned by open() was never closed;
    it is now written inside a `with` block.
    """
    data = read_segment(
        os.path.join(api_path, 'indices.html'),
        '<!-- =========== START OF IDENTIFIER INDEX =========== -->',
        '<!-- =========== START OF NAVBAR =========== -->')
    p = APIIndicesParser(formatter.NullFormatter(formatter.NullWriter()))
    p.feed(data)

    entries = ''.join(entry_hhx % (u, k) for u, k in p.indices)
    hhk = header_hhx + '<UL>' + os.linesep + entries + os.linesep + '</UL>'
    with open(os.path.join(api_path, api_name + '.hhk'), 'w') as out:
        out.write(hhk)
def testLinkExtracting(repositoryListingPage):
    """Parse the repository listing page and fail if no links come out."""
    extractor = LinksExtractor(formatter.NullFormatter())
    extractor.feed(repositoryListingPage)
    extractor.close()

    # Currently just checking their existance: pull one element, if any.
    sentinel = object()
    if next(iter(extractor.getLinks()), sentinel) is sentinel:
        raise RuntimeError("No links found!")
Example #29
0
 def parse_file(self, href):
     """Recursively parse the linked file, tracking parents to avoid cycles.

     :param href: path of the file to parse, relative to self.basedir.

     Fixed: the Python 2 `file()` builtin (removed in Python 3) replaced
     by open() in a `with` block; the membership-guarded remove replaced
     by the equivalent set.discard().
     """
     # TODO: determine basedir
     parent = os.path.join(self.basedir, self.fn)
     self.parents.add(parent)
     parser = PyHTMLParser(formatter.NullFormatter(), self.basedir, href,
                           self.indent + 1, self.parents)
     with open(self.basedir + '/' + href) as fh:
         text = fh.read()
     parser.feed(text)
     parser.finish()
     parser.close()
     # discard() == the original "remove only if present" guard.
     self.parents.discard(parent)
Example #30
0
    def __init__(self, depth, base_url):
        """Subclass (inherit) from HTMLParser.

        :param depth: The depth of each page, i.e., its minimum distance from one of the 10 start pages.
        :param base_url: The base URL to use for all relative URLs contained within a document.
        """
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.depth, self.base_url = depth, base_url
        # Only the first <base> element's href/target values count;
        # remember whether one has already been handled.
        self.has_parsed_base_element = False