def __init__(self):
    HTMLParser.__init__(self)
    self.InRow = 0
    self.InEntry = 0
    self.table = []
    self.tmpRow = []
    self.hyperlinks = []
    self.RunNumber = 0
    self.TriggerRates = []
    self.Nevts = []
    self.LiveLumiByLS = []
    self.DeliveredLumiByLS = []
    self.FirstLS = -1
    self.LastLS = -1
    self.AvLiveLumi = []
    self.AvDeliveredLumi = []
    self.AvDeadtime = []
    self.DeadTime = []  # grant
    self.L1Prescales = []
    self.RunPage = ''
    self.RatePage = ''
    self.LumiPage = ''
    self.L1Page = ''
    self.L1_LS_Page = ''  # grant
    self.PrescaleColumn = []
    self.PrescaleColumnString = ''
def __init__(self):
    HTMLParser.__init__(self)
    self.lasttag = None
    self.title = ""
    self.pagedata = StringIO()
    self.links = []
    self.inbody = False
def __init__(self):
    HTMLParser.__init__(self)
    self.trouve = False
    self.encours = False
    self.correspondance = {
        'CVSS Score': 'cvss_score',
        'Confidentiality Impact': 'confidentialite',
        'Integrity Impact': 'integrite',
        'Availability Impact': 'disponibilite',
        'Access Complexity': 'complexite',
        'Authentication': 'authentification',
        'Vulnerability Type(s)': 'type',
        'CWE ID': None,
        'Gained Access': 'acces_obtention',
    }
    self.reponse = {
        'cvss_score': None,
        'confidentialite': None,
        'integrite': None,
        'disponibilite': None,
        'complexite': None,
        'authentification': None,
        'type': None,
        'acces_obtention': None,
    }
    self.precedent = None
def __init__(self):
    HTMLParser.__init__(self)
    self.tracks = []
    self.endDate = None
    self.curData = ''
    self.curTrack = {}
    self.recording = None
def __init__(self, site=None):
    HTMLParser.__init__(self)
    dict.__init__(self, ())
    self.in_form = False
    self.select = None
    if site:
        self.load(site)
def feed(self, token):
    ttype, tvalue, tstart, tend, tline = token
    self.line = tline

    # Handle whitespace.
    (prev_row, prev_col) = self.lastPos
    (cur_row, cur_col) = tstart
    (end_row, end_col) = tend
    assert cur_row >= prev_row, "Unexpected jump in row"
    self.lastPos = (end_row, end_col)

    # Are we now on a new line?
    if cur_row > prev_row:
        self._appendRows(cur_row - prev_row)

    # Are we on a multiline statement?
    if end_row > cur_row:
        self._appendRows(end_row - cur_row)

    # Interpret jumps on the same line as a single space.
    if cur_row == prev_row and cur_col > prev_col:
        HTMLParser.feed(self, ' ')

    HTMLParser.feed(self, tvalue)
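# feed() above consumes Python tokenize-style 5-tuples rather than raw HTML
# strings. A hedged usage sketch: Parser is a hypothetical name for the
# surrounding class, which is assumed to initialize self.lastPos and to
# implement _appendRows.
import tokenize
from StringIO import StringIO

parser = Parser()  # hypothetical; stands in for the class owning feed()
for token in tokenize.generate_tokens(StringIO("x = 1\ny = 2\n").readline):
    parser.feed(token[:5])  # pass the (type, value, start, end, line) tuple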
def get_jobs(self):
    try:
        jobs_start_time = time.time()
        h = HTMLParser()
        html = h.unescape(self.browser.page_source).encode('utf-8').decode('ascii', 'ignore')
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.findAll('a', id=lambda x: x and x.startswith('popup'))
        counter = 0
        for a in data:
            if a.has_attr('href'):
                counter = counter + 1
                # self.DrawSpinner(counter)
                try:
                    return_code = self.get_job_info(
                        self.browser, self.base_job_url + a['href'].split('?')[1])
                    if return_code == 1:  # in case the error pages start to come
                        jobs_end_time = time.time()
                        print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
                        return
                except Exception:
                    continue
        jobs_end_time = time.time()
        print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
    except Exception as e:
        print 'exception= ', str(e)
        # print 'stacktrace= ', traceback.print_exc()
        print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)
def __init__(self):
    HTMLParser.__init__(self)
    self.url = None
    self.params = {}
    self.in_form = False
    self.form_parsed = False
    self.method = "GET"
def wolfplex(options):
    # Clean out previously imported events.
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()
    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())

    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()

        title = html_parser.unescape(event.text)
        # Guard against entries without a link before reading the href.
        if event.a:
            base_domain = "" if event.a["href"].startswith("http") else "http://www.wolfplex.org"
            url = base_domain + event.a["href"]
        else:
            url = "http://www.wolfplex.org"
        start = parse(date_info.span["title"])

        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None

        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (
                title.encode("Utf-8"),
                "wolfplex",
                location.encode("Utf-8") if location else ""
            )
def __init__(self, properties):
    HTMLParser.__init__(self)
    # properties is a string like "k1,v1;k2,v2"; ignore entries with no comma.
    self.properties = dict(
        (key, value)
        for key, value in (
            prop.split(',')
            for prop in properties.split(';')
            if prop.find(',') > -1
        )
    )
    self.data = []
    self.in_td = 0
    self.tr_name = None
def __init__(self): """ Constructor; initializes washer """ HTMLParser.__init__(self) self.result = '' self.nb = 0 self.previous_nbs = [] self.previous_type_lists = [] self.url = '' self.render_unallowed_tags = False self.allowed_tag_whitelist = \ CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST self.allowed_attribute_whitelist = \ CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST # javascript: self.re_js = re.compile( ".*(j|j|J)"\ "\s*(a|a|A)"\ "\s*(v|v|V)"\ "\s*(a|a|A)"\ "\s*(s|s|S)"\ "\s*(c|c|C)"\ "\s*(r|r|R)"\ "\s*(i|Ã|I)"\ "\s*(p|p|P)"\ "\s*(t|p|T)"\ "\s*(:|:).*", re.IGNORECASE | re.DOTALL) # vbscript: self.re_vb = re.compile( ".*(v|v|V)"\ "\s*(b|b|B)"\ "\s*(s|s|S)"\ "\s*(c|c|C)"\ "\s*(r|r|R)"\ "\s*(i|Ã|I)"\ "\s*(p|p|P)"\ "\s*(t|p|T)"\ "\s*(:|:).*", re.IGNORECASE | re.DOTALL)
def __init__(self, new_path, filename, reference_support_info, host=Host(), convert_test_harness_links=True):
    HTMLParser.__init__(self)
    self._host = host
    self._filesystem = self._host.filesystem
    self._webkit_root = WebKitFinder(self._filesystem).webkit_base()
    self.converted_data = []
    self.converted_properties = []
    self.converted_property_values = []
    self.in_style_tag = False
    self.style_data = []
    self.filename = filename
    self.reference_support_info = reference_support_info

    resources_path = self.path_from_webkit_root('LayoutTests', 'resources')
    resources_relpath = self._filesystem.relpath(resources_path, new_path)
    self.new_test_harness_path = resources_relpath
    self.convert_test_harness_links = convert_test_harness_links

    # These settings might vary between WebKit and Blink.
    self._css_property_file = self.path_from_webkit_root('Source', 'WebCore', 'css', 'CSSPropertyNames.in')
    self._css_property_value_file = self.path_from_webkit_root('Source', 'WebCore', 'css', 'CSSValueKeywords.in')

    self.test_harness_re = re.compile('/resources/testharness')

    self.prefixed_properties = self.read_webkit_prefixed_css_property_list(self._css_property_file)
    prop_regex = ('([\s{]|^)('
                  + "|".join(prop.replace('-webkit-', '') for prop in self.prefixed_properties)
                  + ')(\s+:|:)')
    self.prop_re = re.compile(prop_regex)

    self.prefixed_property_values = self.read_webkit_prefixed_css_property_list(self._css_property_value_file)
    prop_value_regex = ('(:\s*|^\s*)('
                        + "|".join(value.replace('-webkit-', '') for value in self.prefixed_property_values)
                        + ')(\s*;|\s*}|\s*$)')
    self.prop_value_re = re.compile(prop_value_regex)
def __init__(self):
    HTMLParser.__init__(self)
    self.in_records_table = False
    self.record = -1
    self.column = -1
    self.data_row = False
    self.data = []
def __init__(self, feed_data):
    HTMLParser.__init__(self)
    self.courses = tuple()
    self.is_course = False
    self.is_coursename = False
    self.is_homework = False
    self.feed(feed_data)
def __init__(self):
    HTMLParser.__init__(self)
    self.subjectList = {}
    self.tagi = 0
    self.tdi = 0
    self.dataFlag = 0
    self.subName = ""
def getImageLocation(comicRequest):
    titleString = 'id="ctitle">'
    captionString = 'title="'
    imageString = '//imgs.xkcd.com/comics/'

    response = urllib2.urlopen(parseComicRequest(comicRequest))
    html = response.read()

    titleStart = html.find(titleString) + len(titleString)
    titleEnd = html[titleStart:].find('<') + titleStart
    title = html[titleStart:titleEnd]

    imageAddressStart = html.find(imageString)
    imageAddressEnd = html[imageAddressStart:].find('"') + imageAddressStart
    imageAddress = html[imageAddressStart:imageAddressEnd]

    captionStart = (html[imageAddressEnd:].find(captionString)
                    + imageAddressEnd + len(captionString))
    captionEnd = html[captionStart:].find('"') + captionStart
    caption = html[captionStart:captionEnd]

    parser = HTMLParser()
    caption = parser.unescape(caption)
    title = parser.unescape(title)

    return '*' + title + "*\nhttp:" + str(imageAddress) + '\n' + caption
def __init__(self, tag="a", attr="href", process=None, unique=False): HTMLParser.__init__(self) self.scan_tag = tag if callable(tag) else lambda t: t == tag self.scan_attr = attr if callable(attr) else lambda a: a == attr self.process_attr = process if callable(process) else lambda v: v self.unique = unique
def __init__(self):
    HTMLParser.__init__(self)
    self.in_div = False
    self.in_a = False
    self.pattern = re.compile(r'(.*)\((.*)\)')
    self.tangshi_list = []
    self.current_poem = {}
def __init__(self):
    HTMLParser.__init__(self)
    self.glink = False
    self.elink = False
    self.ingroup = []
    self.href = ''
    self.name = ''
def __init__(self):
    HTMLParser.__init__(self)
    self.foundName = False
    self.foundDescription = False
    self.foundPrice = False
    self.foundScore = True
    self.gameInfo = {}
def __init__(self, builder=None, encoding=None):
    self.__stack = []
    if builder is None:
        builder = ElementTree.TreeBuilder()
    self.__builder = builder
    self.encoding = encoding or "iso-8859-1"
    HTMLParser.__init__(self)
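# A parser initialized this way usually just forwards parse events to its
# TreeBuilder. A minimal sketch of the forwarding handlers, assuming they
# live in the same class (so the name-mangled self.__builder resolves);
# real implementations also normalize tags, entities, and encoding:
def handle_starttag(self, tag, attrs):
    self.__builder.start(tag, dict(attrs))

def handle_endtag(self, tag):
    self.__builder.end(tag)

def handle_data(self, data):
    self.__builder.data(data)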
def __init__(self):
    # super(formParser, self).__init__()
    HTMLParser.__init__(self)
    self.dict = {}
    self.stack = []
    self.post = ""
def update_event_description(event_id, description, analyst):
    """
    Update event description.

    :param event_id: The ObjectId of the Event to update.
    :type event_id: str
    :param description: The new description.
    :type description: str
    :param analyst: The user updating this Event.
    :type analyst: str
    :returns: dict with keys "success" (boolean) and "message" (str)
    """

    if not description:
        return {'success': False, 'message': "No description to change"}
    event = Event.objects(id=event_id).first()
    if not event:
        return {'success': False, 'message': "No event found"}

    # Have to unescape the submitted data. Use unescape() for &lt; and
    # friends; use urllib2.unquote() for %3C and friends.
    h = HTMLParser()
    description = h.unescape(description)
    event.description = description
    try:
        event.save(username=analyst)
        return {'success': True}
    except ValidationError as e:
        return {'success': False, 'message': e}
def feed(self, data):
    no_cc = u'no closed captioning available'

    if u'<html' in data.lower():
        raise CaptionReadSyntaxError(u'SAMI File seems to be an HTML file.')
    elif no_cc in data.lower():
        raise CaptionReadSyntaxError(u'SAMI File contains "%s"' % no_cc)

    # Try to find a style tag in the SAMI.
    try:
        # Prevent a BS4 error with huge SAMI files with unclosed tags.
        index = data.lower().find(u"</head>")
        self.styles = self._css_parse(
            BeautifulSoup(data[:index]).find(u'style').get_text())
    except AttributeError:
        self.styles = {}

    # Fix erroneous italics tags.
    data = data.replace(u'<i/>', u'<i>')

    # Fix awkward tags found in some SAMIs.
    data = data.replace(u';>', u'>')

    try:
        HTMLParser.feed(self, data)
    except HTMLParseError as e:
        raise CaptionReadSyntaxError(e)

    # Close any tags that remain in the queue.
    while self.queue != deque([]):
        closing_tag = self.queue.pop()
        self.sami += u"</%s>" % closing_tag

    return self.sami, self.styles, self.langs
def __init__(self, *a, **kw):
    self.indent = ''
    HTMLParser.__init__(self, *a, **kw)
    self.processed_text = ''
    self.tagtracker = []
    self.error_line = 0
    self.line_number = 1
def __init__(self, pdf):
    HTMLParser.__init__(self)
    self.style = {}
    self.pre = False
    self.href = ''
    self.align = ''
    self.page_links = {}
    self.font_list = ("times", "courier", "helvetica")
    self.font = None
    self.font_stack = []
    self.pdf = pdf
    self.r = self.g = self.b = 0
    self.indent = 0
    self.bullet = []
    self.set_font("times", 12)
    self.font_face = "times"     # initialize font
    self.color = 0               # initialize font color
    self.table = None            # table attributes
    self.table_col_width = None  # column (header) widths
    self.table_col_index = None  # current column index
    self.td = None               # cell attributes
    self.th = False              # header enabled
    self.tr = None
    self.theader = None          # table header cells
    self.tfooter = None          # table footer cells
    self.thead = None
    self.tfoot = None
    self.theader_out = self.tfooter_out = False
def __init__(self):
    HTMLParser.__init__(self)
    self.title = False
    self.updated = False
    self.titlestr = ''
    self.updatedstr = ''
    self.list = []
def feed(self, data):
    from HTMLParser import HTMLParser
    data_with_br = data.replace("\n", "<br/>")
    HTMLParser.feed(self, data_with_br)
    if len(self.current_line) > 0:
        self.lines.append(self.current_line)
        self.current_line = ''
def __init__(self, url):
    """Returns a new Sequence object with the specified url.

    url: link to an mp3.zing.vn web page
    """
    HTMLParser.__init__(self)
    self.song_name = []
    self.song_artist = []
    self.song_link = []
    self.song_type = []
    req = urlopen(url)  # open connection to web page
    data = None
    if req.info().get('Content-Encoding') == "gzip":
        buf = StringIO(req.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read().split("\n")
    else:
        data = req.read().split("\n")  # split web page on \n
    feed_data = None
    for param in data:
        if param.find('<param name="flashvars" value="') > -1:
            # Found the line holding the XML url.
            feed_data = param
            break
    self.feed(feed_data)  # parse html data
def insert_to(project_url, destination, find_what, indent=0):
    url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
    response = urllib2.urlopen(url)
    if response.getcode() == 200:
        with open(destination, 'r') as dest:
            dest_contents = dest.readlines()
        lines = ''.join(dest_contents)
        content = HTMLParser().unescape(response.read())
        if content.replace(' ', '') in lines.replace(' ', ''):
            print_out('IGNORED', destination)
            return

        generated = []
        for line in dest_contents:
            generated.append(line)
            if line.lower().find(find_what.lower()) >= 0:
                spaces = len(line) - len(line.lstrip())
                for l in content.split('\n'):
                    if l:
                        generated.append('%s%s\n' % (' ' * (spaces + indent), l))

        with open(destination, 'w') as dest:
            for line in generated:
                dest.write(line)

        print_out('INSERT', destination)
def __init__(self):
    HTMLParser.__init__(self)
    self.map = {}
    self.map_flag = False
    self.list = []
    self.list_flag = False
import csv
from HTMLParser import HTMLParser
import io

books = []
authors = {}
book_authors = []
author_id = 1
publishers = {}
book_publishers = []
publisher_id = 1

h = HTMLParser()

with open('books.csv', 'rb') as csvfile:
    # dialect = csv.Sniffer().sniff(csvfile.read())
    # csvfile.seek(0)
    # reader = csv.reader(csvfile, dialect)
    reader = csv.reader(csvfile, delimiter='\t')
    first_line = True
    for row in reader:
        if first_line:
            first_line = False
            continue
        books.append([
            row[0],
            h.unescape(row[2].decode('utf-8').strip()).encode('utf-8'),
            row[4],
            row[6]
        ])
        auths = row[3]
        current_book_authors = []
        for auth in auths.split(','):
            auth = h.unescape(auth.decode('utf-8').strip())
def __init__(self):
    HTMLParser.__init__(self)
    self.toc = []
    self.page_title = None
    self._recent_tag = None
    self._current_heading = {}
def __init__(self,target="viewcourses"): HTMLParser.__init__(self) self.target=target self.flag=False self.payload={}
def __init__(self):
    hp.__init__(self)
    self.links = []
# -*- coding: utf-8 -*-
import re
from HTMLParser import HTMLParser

from django.utils.html import strip_tags

h = HTMLParser()

# List of words that aren't judge names
NOT_JUDGE = [
    "above", "absent", "acting", "active", "adopted", "affirm", "after",
    "agrees", "all", "although", "and", "affirmed", "appeals", "appellate",
    "argument", "argued", "arj", "ass", "assign", "assigned",
def __init__(self):
    HTMLParser.__init__(self)
    self.path = []
    self.title = []
    self.message = []
def __init__(self): """Initialize the parser.""" HTMLParser.__init__(self) self.stable_version = Version('0.0.0') self.devel_version = Version('0.0.0')
def update_wiki_tracker(self, comment):
    """
    Update the wiki page of the person earning the delta.

    Note: the comment passed in is the comment awarding the delta; its
    parent comment is the one earning the delta.
    """
    logging.info("Updating wiki")
    comment_url = comment.permalink
    submission_url = comment.submission.permalink
    submission_title = comment.submission.title
    parent = self.reddit.get_info(thing_id=comment.parent_id)
    parent_author = parent.author.name
    author_flair = str(self.subreddit.get_flair(parent_author))
    author_flair = re.search("(flair_text': u')(\d*)", author_flair)
    flair_count = "0 deltas"
    if author_flair:
        flair_count = author_flair.group(2)
        if flair_count == "1":
            flair_count = "1 delta"
        else:
            flair_count += " deltas"
    if comment.author:
        awarder_name = comment.author.name
    else:
        return  # skip, in case the awarding comment is deleted
    today = datetime.date.today()

    # Try to get the wiki page for the user; this throws an exception if
    # the page doesn't exist.
    try:
        user_wiki_page = self.reddit.get_wiki_page(
            self.config.subreddit, "user/" + parent_author)
        # Get the old wiki page content as a markdown string, unescaping
        # any previously escaped HTML characters.
        old_content = HTMLParser().unescape(user_wiki_page.content_md)
        # Alter how many deltas are in the first line.
        try:
            old_content = re.sub("([0-9]+) delta[s]?", flair_count, old_content)
        except:
            print("The 'has received' line in the wiki has failed to update.")
        # Compile a regex to search for the current link formatting.
        # It only matches links that are correctly formatted, so it will not
        # be broken by malformed links or links made by previous versions of
        # DeltaBot.
        regex = re.compile(
            "\\* \\[%s\\]\\(%s\\) \\(\d+\\)"
            % (re.escape(submission_title), re.escape(submission_url)))
        # Search old page content for the link.
        old_link = regex.search(old_content)
        # Variable for updated wiki content.
        new_content = ""
        if old_link:
            # The old link exists; only increase the number of deltas for
            # the post, using re.sub to increment the count in the link.
            new_link = re.sub(
                "\((\d+)\)",
                lambda match: "(" + str(int(match.group(1)) + 1) + ")",
                old_link.group(0))
            # Insert the link to the new delta.
            new_link += "\n 1. [Awarded by /u/%s](%s) on %s/%s/%s" % (
                awarder_name, comment_url + "?context=3",
                today.month, today.day, today.year)
            # Use re.sub to replace the old link with the new link.
            new_content = re.sub(regex, new_link, old_content)
        else:
            # No old link; create one with an initial count of 1, formatted
            # as a markdown list item. "?context=2" means the link shows the
            # comment earning the delta and the comment awarding it. "(1)"
            # is the number of deltas earned from that comment (1 because
            # this is the first delta the user has earned from it).
            add_link = "\n\n* [%s](%s) (1)\n 1. [Awarded by /u/%s](%s) on %s/%s/%s" % (
                submission_title, submission_url, awarder_name,
                comment_url + "?context=2",
                today.month, today.day, today.year)
            # Get previous content as a markdown string and append the new
            # content.
            new_content = user_wiki_page.content_md + add_link
        # Overwrite the old content with the new content.
        self.reddit.edit_wiki_page(self.config.subreddit,
                                   user_wiki_page.page, new_content,
                                   "Updated delta links.")
    # If the page doesn't exist, create it with initial content.
    except:
        # Create a header for the new wiki page.
        initial_text = ("/u/%s has received a request point for the "
                        "following comments:" % parent_author)
        # Create the link, formatted as a markdown list item (see the
        # comments above for the meaning of "?context=2" and "(1)").
        add_link = "\n\n* [%s](%s) (1)\n 1. [Awarded by /u/%s](%s) on %s/%s/%s" % (
            submission_title, submission_url, awarder_name,
            comment_url + "?context=2",
            today.month, today.day, today.year)
        # Combine header and link.
        full_update = initial_text + add_link
        # Write the new content to the wiki page.
        self.reddit.edit_wiki_page(self.config.subreddit,
                                   "user/" + parent_author, full_update,
                                   "Created user's delta links page.")

    # Add the new awardee to the Delta Tracker wiki page.
    delta_tracker_page = self.reddit.get_wiki_page(
        self.config.subreddit, "delta_tracker")
    # Retrieve the tracker page content as a markdown string.
    delta_tracker_page_body = delta_tracker_page.content_md
    # Create a link to the user's wiki page as a markdown list item.
    new_link = "\n\n* /u/%s -- [Delta List](/r/%s/wiki/%s)" % (
        parent_author, self.config.subreddit, parent_author)
    # Append the new link to the old content and overwrite the page.
    new_content = delta_tracker_page_body + new_link
    self.reddit.edit_wiki_page(self.config.subreddit, "delta_tracker",
                               new_content, "Updated tracker page.")
def run(self):
    self.progressbar_show.emit(True)
    self.info_label.emit(
        translate("AddonsInstaller", "Retrieving description..."))
    if len(self.macros[self.idx]) > 2:
        desc = self.macros[self.idx][2]
        url = self.macros[self.idx][4]
    else:
        mac = self.macros[self.idx][0].replace(" ", "_")
        mac = mac.replace("&", "%26")
        mac = mac.replace("+", "%2B")
        url = "https://www.freecadweb.org/wiki/Macro_" + mac
        self.info_label.emit("Retrieving info from " + str(url))
        if ctx:
            u = urllib2.urlopen(url, context=ctx)
        else:
            u = urllib2.urlopen(url)
        p = u.read()
        if sys.version_info.major >= 3 and isinstance(p, bytes):
            p = p.decode("utf-8")
        u.close()
        code = re.findall("<pre>(.*?)<\/pre>", p.replace("\n", "--endl--"))
        if code:
            # code = code[0]
            # Take the biggest code block.
            code = sorted(code, key=len)[-1]
            code = code.replace("--endl--", "\n")
        else:
            self.info_label.emit(
                translate("AddonsInstaller",
                          "Unable to fetch the code of this macro."))
            self.progressbar_show.emit(False)
            self.stop = True
            return
        desc = re.findall(
            "<td class=\"ctEven left macro-description\">(.*?)<\/td>",
            p.replace("\n", " "))
        if desc:
            desc = desc[0]
        else:
            self.info_label.emit(
                translate("AddonsInstaller",
                          "Unable to retrieve a description for this macro."))
            desc = "No description available"
        # Clean HTML escape codes.
        try:
            from HTMLParser import HTMLParser
        except ImportError:
            from html.parser import HTMLParser
        try:
            code = code.decode("utf8")
            code = HTMLParser().unescape(code)
            code = code.encode("utf8")
            code = code.replace("\xc2\xa0", " ")
        except:
            FreeCAD.Console.PrintWarning(
                translate("AddonsInstaller", "Unable to clean macro code: ")
                + mac + "\n")
    self.update_macro.emit(self.idx, self.macros[self.idx] + [desc, code, url])
    if self.macros[self.idx][1] == 1:
        message = ("<strong>"
                   + translate("AddonsInstaller",
                               "This addon is already installed.")
                   + "</strong><br>" + desc
                   + ' - <a href="' + url + '">'
                   '<span style="word-wrap: break-word;width:15em;'
                   'text-decoration: underline; color:#0000ff;">'
                   + url + '</span></a>')
    else:
        message = (desc
                   + ' - <a href="' + url + '">'
                   '<span style="word-wrap: break-word;width:15em;'
                   'text-decoration: underline; color:#0000ff;">'
                   + url + '</span></a>')
    self.info_label.emit(message)
    self.progressbar_show.emit(False)
    self.stop = True
def decode_html_entities(df):
    print('Decoding HTML entities...')
    h = HTMLParser()
    df['body'] = df['body'].apply(lambda row: h.unescape(row))
    return df
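# A minimal usage sketch for decode_html_entities, assuming df is a pandas
# DataFrame with a 'body' column of entity-escaped strings (pandas is an
# assumption here; the function only requires that df['body'] support .apply):
import pandas as pd

df = pd.DataFrame({'body': ['Tom &amp; Jerry', '&lt;b&gt;bold&lt;/b&gt;']})
df = decode_html_entities(df)
# df['body'] now contains: 'Tom & Jerry' and '<b>bold</b>'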
def __init__(self, sanitizationLevel=_defaultSanitizationLevel):
    HTMLParser.__init__(self)
    if sanitizationLevel not in range(0, 3):
        sanitizationLevel = self._defaultSanitizationLevel
    self._sanitizationLevel = sanitizationLevel
def print_info(self, req, req_body, res, res_body):
    def parse_qsl(s):
        return '\n'.join(
            "%-20s %s" % (k, v)
            for k, v in urlparse.parse_qsl(s, keep_blank_values=True))

    req_header_text = "%s %s %s\n%s" % (req.command, req.path,
                                        req.request_version, req.headers)
    res_header_text = "%s %d %s\n%s" % (res.response_version, res.status,
                                        res.reason, res.headers)

    print with_color(33, req_header_text)

    u = urlparse.urlsplit(req.path)
    if u.query:
        query_text = parse_qsl(u.query)
        print with_color(32, "==== QUERY PARAMETERS ====\n%s\n" % query_text)

    cookie = req.headers.get('Cookie', '')
    if cookie:
        cookie = parse_qsl(re.sub(r';\s*', '&', cookie))
        print with_color(32, "==== COOKIE ====\n%s\n" % cookie)

    auth = req.headers.get('Authorization', '')
    if auth.lower().startswith('basic'):
        token = auth.split()[1].decode('base64')
        print with_color(31, "==== BASIC AUTH ====\n%s\n" % token)

    if req_body is not None:
        req_body_text = None
        content_type = req.headers.get('Content-Type', '')
        if content_type.startswith('application/x-www-form-urlencoded'):
            req_body_text = parse_qsl(req_body)
        elif content_type.startswith('application/json'):
            try:
                json_obj = json.loads(req_body)
                json_str = json.dumps(json_obj, indent=2)
                if json_str.count('\n') < 50:
                    req_body_text = json_str
                else:
                    lines = json_str.splitlines()
                    req_body_text = "%s\n(%d lines)" % (
                        '\n'.join(lines[:50]), len(lines))
            except ValueError:
                req_body_text = req_body
        elif len(req_body) < 1024:
            req_body_text = req_body
        if req_body_text:
            print with_color(32, "==== REQUEST BODY ====\n%s\n" % req_body_text)

    print with_color(36, res_header_text)

    cookie = res.headers.get('Set-Cookie', '')
    if cookie:
        cookie = parse_qsl(re.sub(r';\s*', '&', cookie))
        print with_color(31, "==== SET-COOKIE ====\n%s\n" % cookie)

    if res_body is not None:
        res_body_text = None
        content_type = res.headers.get('Content-Type', '')
        if content_type.startswith('application/json'):
            try:
                json_obj = json.loads(res_body)
                json_str = json.dumps(json_obj, indent=2)
                if json_str.count('\n') < 50:
                    res_body_text = json_str
                else:
                    lines = json_str.splitlines()
                    res_body_text = "%s\n(%d lines)" % (
                        '\n'.join(lines[:50]), len(lines))
            except ValueError:
                res_body_text = res_body
        elif content_type.startswith('text/html'):
            m = re.search(r'<title[^>]*>([\s\S]+?)</title>', res_body, re.I)
            if m:
                h = HTMLParser()
                print with_color(32, "==== HTML TITLE ====\n%s\n"
                                 % h.unescape(m.group(1).decode('utf-8')))
        elif content_type.startswith('text/') and len(res_body) < 1024:
            res_body_text = res_body
        if res_body_text:
            print with_color(32, "==== RESPONSE BODY ====\n%s\n" % res_body_text)
def __init__(self):
    HTMLParser.__init__(self)
    self.pageId = None
    self.pageTitle = None
    self.shortURL = None
    self.dest = None
def extract_sentences(self, mode="split", source="fulltext"):
    '''
    Finds sentence boundaries and saves the results in the attribute
    "sentences" as a list of Sentence objects.

    Parameters
    ----------
    mode : str, optional, default = "split"
        Split the sentences ("split") or use the whole "source" as a
        single sentence ("no-split"). Useful for developing and debugging.
    source : str, optional, default = "fulltext"
        Use the "fulltext" or the "abstract" to extract sentences.
    '''
    text = ""
    if source == "fulltext":
        text = str(self.fulltext)
    else:
        text = str(self.abstract)

    if mode == "no-split":
        # Don't try to separate the sentences:
        # everything in the text is just one sentence!
        self.sentences.append(Sentence(originaltext=text))
    else:
        caps = "([A-Z])"
        prefixes = "(Mr|Fig|fig|St|Mrs|Ms|Dr)[.]"
        digits = "([0-9])"
        fig_letters = "([A-Ka-k])"
        suffixes = "(Inc|Ltd|Jr|Sr|Co)"
        starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
        acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
        websites = "[.](com|net|org|io|gov)"
        species = r"([A-Z])[.] ?([a-z]+)"

        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + caps + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(caps + "[.]" + caps + "[.]" + caps + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(caps + "[.]" + caps + "[.]", "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + caps + "[.]", " \\1<prd>", text)
        text = re.sub(digits + caps + "[.]", " \\1<prd>", text)
        text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
        text = re.sub(digits + "[.]" + fig_letters, "\\1<prd>\\2", text)
        text = re.sub(species, "\\1<prd> \\2", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        if "e.g." in text:
            text = text.replace("e.g.", "e<prd>g<prd>")
        if "i.e." in text:
            text = text.replace("i.e.", "i<prd>e<prd>")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        # sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]

        h = HTMLParser()
        for sentence in sentences:
            sentence = str(h.unescape(sentence))
            if not sentence.strip() or not isinstance(sentence, str):
                continue
            self.sentences.append(Sentence(originaltext=sentence))
def __init__(self):
    HTMLParser.__init__(self)
    self.__text = []
def __init__(self):
    HTMLParser.__init__(self)
    self.item = Commodity()
    self.state = 0
def strip_tags(html):
    parser = HTMLParser()
    html = parser.unescape(html)
    s = MLStripper()
    s.feed(html)
    return s.get_data()
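# strip_tags above depends on an MLStripper class that is not shown in this
# snippet. A minimal sketch of what it presumably looks like, following the
# usual HTMLParser-subclass pattern of keeping only text nodes:
class MLStripper(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        # Only text content is kept; tags are silently dropped.
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)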
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

# For converting & <-> &amp; etc.
try:
    from html import escape
except ImportError:
    from cgi import escape
if sys.version_info[:2] < (3, 4):
    unescape = HTMLParser().unescape
else:
    from html import unescape

try:
    from collections import ChainMap
except ImportError:  # pragma: no cover
    from collections import MutableMapping

try:
    from reprlib import recursive_repr as _recursive_repr
except ImportError:
    def _recursive_repr(fillvalue='...'):
        '''
        Decorator to make a repr function return fillvalue for a recursive
def __init__(self):
    HTMLParser.__init__(self)
    self.word2id = {}
    self.article_id = None
    self.word_pos = None
def __init__(self):
    HTMLParser.__init__(self)
    self.tag_results = {}
def reset(self):
    HTMLParser.reset(self)
    self.state = 0
def __init__(self):
    HTMLParser.__init__(self)
    self.link = None
#!/usr/bin/env python
import re
import requests
from requests_kerberos import HTTPKerberosAuth, OPTIONAL
import subprocess
import socket
import json
import univention.testing.utils as utils
import univention.config_registry as configRegistry
from HTMLParser import HTMLParser

html = HTMLParser()


class SamlError(Exception):
    """Custom error for everything SAML related"""

    def __init__(self, msg):
        self.message = msg

    def __str__(self):
        return repr(self.message)


class SamlLoginError(SamlError):
    def __init__(self, page):
        self.page = page
        self.message = ''
        self._error_evaluation()

    def _error_evaluation(self):
def close(self):
    HTMLParser.close(self)
    return self.__builder.close()
def __init__(self):
    HTMLParser.__init__(self)
    self.result = []
def __init__(self):
    self.links = {}
    f = formatter.NullFormatter()
    HTMLParser.__init__(self, f)
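# The formatter argument marks this as a subclass of the old Python 2
# htmllib.HTMLParser rather than the HTMLParser module's class. A minimal
# sketch of an anchor handler that would populate self.links; counting each
# href per URL is an assumption, since the dict's payload is not shown:
def start_a(self, attrs):
    for name, value in attrs:
        if name == 'href':
            self.links[value] = self.links.get(value, 0) + 1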
def __init__(self, strings):
    HTMLParser.__init__(self)
    self.strings = strings
    self.hit_end_tag = False
    self.in_no_split_tag = False
    self.text = ''
def __init__(self):
    HTMLParser.__init__(self)
    self.result_info = []
    self.link = ''
    self.title = ''
def __init__(self):
    self.__stack = []
    self.__builder = ElementTree.TreeBuilder()
    HTMLParser.__init__(self)