Beispiel #1
0
 def __init__(self):
     """Reset parser state: no tracks collected, no dates, no recording."""
     HTMLParser.__init__(self)
     self.tracks, self.endDate = [], None
     self.curData, self.curTrack = '', {}
     self.recording = None
Beispiel #2
0
 def get_jobs(self):
     """Scrape every job posting linked from the currently loaded page.

     Finds all anchors whose id starts with 'popup', follows each href via
     get_job_info(), and returns early when get_job_info() reports 1 (the
     caller's signal that error pages started appearing).  Elapsed time is
     printed; per-job failures are swallowed so scraping continues.
     """
     try:
         jobs_start_time = time.time()
         h = HTMLParser()
         # Unescape HTML entities, then drop any non-ASCII characters.
         html = h.unescape(self.browser.page_source).encode('utf-8').decode('ascii', 'ignore')
         soup = BeautifulSoup(html, 'html.parser')
         # Job links are anchors whose id begins with 'popup'.
         data = soup.findAll('a', id=lambda x: x and x.startswith('popup'))
         counter = 0
         for a in data:
             if a.has_attr('href'):
                 counter = counter + 1
                 #self.DrawSpinner(counter)
                 try:
                     # Pass along the href's query-string portion to the detail URL.
                     return_code = self.get_job_info(self.browser, self.base_job_url + a['href'].split('?')[1])
                     if return_code == 1:
                         #In case the error pages starts to come
                         jobs_end_time = time.time()
                         print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
                         return

                 except Exception:
                     # Best-effort: skip jobs whose detail page fails.
                     continue
         jobs_end_time = time.time()
         print 'All jobs scraping time =', str(jobs_end_time - jobs_start_time)
     except Exception as e:
         print 'exception= ', str(e)
         #print 'stacktrace= ', traceback.print_exc()
         print 'Line Number= ' + str(sys.exc_traceback.tb_lineno)
Beispiel #3
0
    def __init__(self, site=None):
        """Initialize parser and dict state; load *site* right away if given."""
        HTMLParser.__init__(self)
        dict.__init__(self, ())
        self.in_form = False
        self.select = None
        if site:
            self.load(site)
Beispiel #4
0
 def __init__(self):
     """Constructor; initializes washer state.

     Sets up the output buffer, nesting counters, the allowed tag and
     attribute whitelists, and two regexes that detect "javascript:" /
     "vbscript:" URL schemes even when the letters are separated by
     whitespace.  NOTE(review): the alternations below contain repeated
     and odd characters -- they look like the remains of HTML-entity
     obfuscated letters lost in transcoding; confirm against the
     original source before editing the patterns.
     """
     HTMLParser.__init__(self)
     self.result = ''
     self.nb = 0
     self.previous_nbs = []
     self.previous_type_lists = []
     self.url = ''
     self.render_unallowed_tags = False
     self.allowed_tag_whitelist = \
             CFG_HTML_BUFFER_ALLOWED_TAG_WHITELIST
     self.allowed_attribute_whitelist = \
             CFG_HTML_BUFFER_ALLOWED_ATTRIBUTE_WHITELIST
     # javascript:
     self.re_js = re.compile( ".*(j|j|J)"\
                             "\s*(a|a|A)"\
                             "\s*(v|v|V)"\
                             "\s*(a|a|A)"\
                             "\s*(s|s|S)"\
                             "\s*(c|c|C)"\
                             "\s*(r|r|R)"\
                             "\s*(i|Ã|I)"\
                             "\s*(p|p|P)"\
                             "\s*(t|p|&#84)"\
                             "\s*(:|:).*", re.IGNORECASE | re.DOTALL)
     # vbscript:
     self.re_vb = re.compile( ".*(v|v|V)"\
                             "\s*(b|b|B)"\
                             "\s*(s|s|S)"\
                             "\s*(c|c|C)"\
                             "\s*(r|r|R)"\
                             "\s*(i|Ã|I)"\
                             "\s*(p|p|P)"\
                             "\s*(t|p|T)"\
                             "\s*(:|:).*", re.IGNORECASE | re.DOTALL)
def wolfplex(options):
    # clean events
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()

    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())

    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()

        title = html_parser.unescape(event.text)
        base_domain = "http://www.wolfplex.org" if not event.a["href"].startswith("http") else ""
        url = (base_domain + event.a["href"]) if event.a else "http://www.wolfplex.org"
        start = parse(date_info.span["title"])

        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None

        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "wolfplex", location.encode("Utf-8") if location else "")
Beispiel #6
0
	def __init__(self):
		"""Reset parse state: empty subject table, zeroed counters and flags."""
		HTMLParser.__init__(self)
		self.subjectList = {}
		self.tagi = self.tdi = self.dataFlag = 0
		self.subName = ""
 def __init__(self):
     """Start outside the records table, with no row or column selected."""
     HTMLParser.__init__(self)
     self.in_records_table = self.data_row = False
     self.record = self.column = -1
     self.data = []
Beispiel #8
0
def update_event_description(event_id, description, analyst):
    """
    Update event description.

    :param event_id: The ObjectId of the Event to update.
    :type event_id: str
    :param description: The new description.
    :type description: str
    :param analyst: The user updating this Event.
    :type analyst: str
    :returns: dict with keys "success" (boolean) and "message" (str)
    """

    if not description:
        return {'success': False, 'message': "No description to change"}
    event = Event.objects(id=event_id).first()
    if not event:
        return {'success': False, 'message': "No event found"}
    # Have to unescape the submitted data. Use unescape() to escape
    # < and friends. Use urllib2.unquote() to escape %3C and friends.
    h = HTMLParser()
    description = h.unescape(description)
    event.description = description
    try:
        event.save(username=analyst)
        return {'success': True}
    except ValidationError as e:
        # FIX: 'except X, e' is Python-2-only syntax; 'as' works on 2.6+
        # and 3.x.  Stringify so "message" is a str as documented, not an
        # exception object.
        return {'success': False, 'message': str(e)}
 def feed(self, data):
   """Feed *data* with newlines rendered as <br/>; flush any pending line."""
   from HTMLParser import HTMLParser
   HTMLParser.feed(self, data.replace("\n", "<br/>"))
   if self.current_line:
     self.lines.append(self.current_line)
     self.current_line = ''
Beispiel #10
0
    def feed(self, data):
        """Parse SAMI caption *data*; return (sami, styles, langs).

        Raises CaptionReadSyntaxError if the input looks like a plain HTML
        page or contains the "no closed captioning available" marker.  Any
        tags still open at the end are closed before returning.
        """
        no_cc = u'no closed captioning available'

        if u'<html' in data.lower():
            raise CaptionReadSyntaxError(u'SAMI File seems to be an HTML file.')
        elif no_cc in data.lower():
            raise CaptionReadSyntaxError(u'SAMI File contains "%s"' % no_cc)

        # try to find style tag in SAMI
        try:
            # prevent BS4 error with huge SAMI files with unclosed tags
            index = data.lower().find(u"</head>")

            self.styles = self._css_parse(
                BeautifulSoup(data[:index]).find(u'style').get_text())
        except AttributeError:
            # no <style> element found in the head
            self.styles = {}

        # fix erroneous italics tags
        data = data.replace(u'<i/>', u'<i>')

        # fix awkward tags found in some SAMIs
        data = data.replace(u';>', u'>')
        try:
            HTMLParser.feed(self, data)
        except HTMLParseError as e:
            raise CaptionReadSyntaxError(e)

        # close any tags that remain in the queue
        while self.queue != deque([]):
            closing_tag = self.queue.pop()
            self.sami += u"</%s>" % closing_tag

        return self.sami, self.styles, self.langs
Beispiel #11
0
 def __init__(self, *a, **kw):
     """Initialize indent tracking, the base parser, and output/line state."""
     self.indent = ''
     HTMLParser.__init__(self, *a, **kw)
     self.processed_text, self.tagtracker = '', []
     self.error_line, self.line_number = 0, 1
	def __init__(self):
		"""Initialize flags and accumulators for title/updated extraction."""
		HTMLParser.__init__(self)
		self.title = self.updated = False
		self.titlestr = self.updatedstr = ''
		self.list = []
Beispiel #13
0
 def __init__(self, pdf):
     """Bind this HTML renderer to *pdf* and reset all layout state.

     :param pdf: target PDF document object.

     NOTE(review): set_font() is called mid-initialization and is defined
     elsewhere in this class; the attribute assignments after it may rely
     on it running first -- preserve the ordering.
     """
     HTMLParser.__init__(self)
     self.style = {}
     self.pre = False
     self.href = ''
     self.align = ''
     self.page_links = {}
     self.font_list = ("times","courier", "helvetica")
     self.font = None
     self.font_stack = []
     self.pdf = pdf
     # current text color components
     self.r = self.g = self.b = 0
     self.indent = 0
     self.bullet = []
     self.set_font("times", 12)
     self.font_face = "times"    # initialize font
     self.color = 0              #initialize font color
     self.table = None           # table attributes
     self.table_col_width = None # column (header) widths
     self.table_col_index = None # current column index
     self.td = None              # cell attributes
     self.th = False             # header enabled
     self.tr = None
     self.theader = None           # table header cells
     self.tfooter = None           # table footer cells
     self.thead = None
     self.tfoot = None
     self.theader_out = self.tfooter_out = False
    def __init__(self, url):
        """Returns new Sequence object with specified url

        url: link to mp3.zing.vn web page

        Downloads the page (decompressing if the server sent gzip), then
        feeds only the line that embeds the flash player's "flashvars"
        parameter, which carries the playlist XML URL.
        """
        HTMLParser.__init__(self)
        self.song_name = []
        self.song_artist = []
        self.song_link = []
        self.song_type = []
        req = urlopen(url)  # open connection to web page
        data = None
        if req.info().get('Content-Encoding') == "gzip":
            # response is gzip-compressed; decompress before splitting
            buf = StringIO( req.read())
            f = gzip.GzipFile(fileobj=buf)
            data = f.read().split("\n")
        else:
            data = req.read().split("\n")  # split web page with \n
        feed_data = None
        for param in data:
            if (param.find('<param name="flashvars" value="') > -1):
                """Find line to get xml url
                """
                feed_data = param
                break
        self.feed(feed_data)  # parser html data
Beispiel #15
0
    def __init__(self, tag="a", attr="href", process=None, unique=False):
        """Collect values of *attr* on *tag* elements.

        Each of *tag*, *attr*, *process* may already be a callable
        predicate/transform; plain values are wrapped in the equivalent
        callable.
        """
        HTMLParser.__init__(self)
        if callable(tag):
            self.scan_tag = tag
        else:
            self.scan_tag = lambda t: t == tag
        if callable(attr):
            self.scan_attr = attr
        else:
            self.scan_attr = lambda a: a == attr
        if callable(process):
            self.process_attr = process
        else:
            self.process_attr = lambda v: v
        self.unique = unique
 def __init__(self):
     """Initialize an empty field dict, tag stack, and POST target."""
     HTMLParser.__init__(self)
     self.dict = {}
     self.stack = []
     self.post = ""
Beispiel #17
0
	def __init__(self, feed_data):
		"""Parse *feed_data* immediately, collecting courses as they appear."""
		HTMLParser.__init__(self)
		self.courses = ()
		self.is_course = self.is_coursename = self.is_homework = False
		self.feed(feed_data)
Beispiel #18
0
def insert_to(project_url, destination, find_what, indent=0):
	"""Download '<project_url>magic/<destination>' and splice its contents
	into the local *destination* file, after each line containing
	*find_what* (case-insensitive).

	Inserted lines inherit the matched line's leading whitespace plus
	*indent* extra spaces.  If the content (compared ignoring spaces) is
	already present, the file is left untouched.
	"""
	url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
	response = urllib2.urlopen(url)
	if response.getcode() == 200:
		with open(destination, 'r') as dest:
			dest_contents = dest.readlines()
			lines = ''.join(dest_contents)
			content = HTMLParser().unescape(response.read())
			# ignore whitespace differences when checking for presence
			if content.replace(' ', '') in lines.replace(' ', ''):
				print_out('IGNORED', destination)
				return

		generated = []
		for line in dest_contents:
			generated.append(line)
			if line.lower().find(find_what.lower()) >= 0:
				# match the indentation of the anchor line
				spaces = len(line) - len(line.lstrip())
				for l in content.split('\n'):
					if l:
						generated.append('%s%s\n' % (' ' * (spaces + indent), l))

		with open(destination, 'w') as dest:
			for line in generated:
				dest.write(line)
			print_out('INSERT', destination)
    def __init__(self, new_path, filename, reference_support_info, host=Host(), convert_test_harness_links=True):
        """Set up a test converter for one file.

        :param new_path: directory the converted test will live in; used to
            compute the relative path back to LayoutTests/resources.
        :param filename: name of the file being converted.
        :param reference_support_info: support-file info for reference tests.
        :param host: Host abstraction providing the filesystem.
            NOTE(review): Host() in the default is evaluated once at
            definition time, so default-constructed instances share it.
        :param convert_test_harness_links: when True, testharness links are
            rewritten to the computed relative resources path.
        """
        HTMLParser.__init__(self)

        self._host = host
        self._filesystem = self._host.filesystem
        self._webkit_root = WebKitFinder(self._filesystem).webkit_base()

        self.converted_data = []
        self.converted_properties = []
        self.converted_property_values = []
        self.in_style_tag = False
        self.style_data = []
        self.filename = filename
        self.reference_support_info = reference_support_info

        # Relative path from the new test location back to shared resources.
        resources_path = self.path_from_webkit_root('LayoutTests', 'resources')
        resources_relpath = self._filesystem.relpath(resources_path, new_path)
        self.new_test_harness_path = resources_relpath
        self.convert_test_harness_links = convert_test_harness_links

        # These settings might vary between WebKit and Blink
        self._css_property_file = self.path_from_webkit_root('Source', 'WebCore', 'css', 'CSSPropertyNames.in')
        self._css_property_value_file = self.path_from_webkit_root('Source', 'WebCore', 'css', 'CSSValueKeywords.in')

        self.test_harness_re = re.compile('/resources/testharness')

        # Regexes matching unprefixed forms of -webkit- CSS properties/values.
        self.prefixed_properties = self.read_webkit_prefixed_css_property_list(self._css_property_file)
        prop_regex = '([\s{]|^)(' + "|".join(prop.replace('-webkit-', '') for prop in self.prefixed_properties) + ')(\s+:|:)'
        self.prop_re = re.compile(prop_regex)

        self.prefixed_property_values = self.read_webkit_prefixed_css_property_list(self._css_property_value_file)
        prop_value_regex = '(:\s*|^\s*)(' + "|".join(value.replace('-webkit-', '') for value in self.prefixed_property_values) + ')(\s*;|\s*}|\s*$)'
        self.prop_value_re = re.compile(prop_value_regex)
Beispiel #20
0
	def __init__(self):
		"""Track which game fields were found; foundScore starts True."""
		HTMLParser.__init__(self)
		self.foundName = self.foundDescription = self.foundPrice = False
		self.foundScore = True
		self.gameInfo = {}
Beispiel #21
0
 def __init__(self, properties):
     """Parse *properties* ("key,value;key,value;...") and reset row state."""
     HTMLParser.__init__(self)
     pairs = (entry.split(',') for entry in properties.split(';') if ',' in entry)
     self.properties = {key: value for key, value in pairs}
     self.data = []
     self.in_td = 0
     self.tr_name = None
Beispiel #22
0
 def __init__(self, builder=None, encoding=None):
     """Wrap *builder* (a fresh TreeBuilder by default) with the given encoding."""
     self.__stack = []
     self.__builder = ElementTree.TreeBuilder() if builder is None else builder
     self.encoding = encoding or "iso-8859-1"
     HTMLParser.__init__(self)
Beispiel #23
0
 def __init__(self):
     """Prepare to capture a form: no URL yet, GET method, empty params."""
     HTMLParser.__init__(self)
     self.url = None
     self.params = {}
     self.in_form = self.form_parsed = False
     self.method = "GET"
 def __init__(self):
     """Initialize poem-collection state and the "title(author)" pattern."""
     HTMLParser.__init__(self)
     self.in_div = self.in_a = False
     self.pattern = re.compile(r'(.*)\((.*)\)')
     self.tangshi_list = []
     self.current_poem = {}
Beispiel #25
0
    def feed(self, token):
        """Feed one tokenize-style 5-tuple into the underlying HTMLParser.

        Whitespace between tokens is reconstructed from token positions:
        row jumps append blank rows via _appendRows(), and a column gap on
        the same row is collapsed into a single space.
        """
        ttype, tvalue, tstart, tend, tline = token
        self.line = tline

        # Handle whitespace
        (prev_row, prev_col) = self.lastPos
        (cur_row, cur_col) = tstart
        (end_row, end_col) = tend

        assert cur_row >= prev_row, "Unexpected jump in row"
        self.lastPos = (end_row, end_col)

        # are we now on a new line?
        if cur_row > prev_row:
            self._appendRows(cur_row - prev_row)

        # are we on a multiline statement?
        if end_row > cur_row:
            self._appendRows(end_row - cur_row)

        # interpret jumps on the same line as a single space
        if cur_row == prev_row and cur_col > prev_col:
            HTMLParser.feed(self, ' ')

        HTMLParser.feed(self, tvalue)
Beispiel #26
0
 def __init__(self):
     """Reset link-tracking flags and clear any captured href/name."""
     HTMLParser.__init__(self)
     self.glink = self.elink = False
     self.ingroup = []
     self.href = self.name = ''
Beispiel #27
0
    def __init__(self):
        """Initialize CVSS-detail scraping state.

        ``correspondance`` maps page field labels to keys of ``reponse``
        (None for fields to ignore); ``reponse`` holds the scraped values,
        all initially None.  (FIX: removed a dead ``self.reponse={}``
        assignment that was immediately overwritten below.)
        """
        HTMLParser.__init__(self)
        self.trouve = False
        self.encours = False
        self.correspondance = {
            'CVSS Score':'cvss_score',
            'Confidentiality Impact':'confidentialite',
            'Integrity Impact':'integrite',
            'Availability Impact':'disponibilite',
            'Access Complexity':'complexite',
            'Authentication':'authentification',
            'Vulnerability Type(s)':'type',
            'CWE ID':None,
            'Gained Access':'acces_obtention'
        }

        self.reponse = {
            'cvss_score':None,
            'confidentialite':None,
            'integrite':None,
            'disponibilite':None,
            'complexite':None,
            'authentification':None,
            'type':None,
            'acces_obtention':None
        }

        self.precedent = None
Beispiel #28
0
def getImageLocation(comicRequest):
    """Fetch an xkcd comic page; return title, image URL, and caption as
    one message string ("*title*", then the http: image link, then the
    hover-text caption, newline-separated).

    Extraction uses plain string searches against known page markers.
    NOTE(review): if a marker is missing, str.find returns -1 and the
    slicing silently yields garbage instead of raising.
    """

    # Markers that precede the title, hover caption, and image URL.
    titleString = 'id="ctitle">'
    captionString = 'title="'
    imageString = '//imgs.xkcd.com/comics/'

    response = urllib2.urlopen(parseComicRequest(comicRequest))
    html = response.read()

    titleStart = html.find(titleString) + len(titleString)
    titleEnd = html[titleStart:].find('<') + titleStart
    title = html[titleStart:titleEnd]

    imageAddressStart = html.find(imageString)
    imageAddressEnd = html[imageAddressStart:].find('"') + imageAddressStart
    imageAddress = html[imageAddressStart:imageAddressEnd]

    # Caption is searched only after the image address to skip earlier
    # unrelated title= attributes.
    captionStart = (
        html[imageAddressEnd:].find(captionString) + imageAddressEnd +
        len(captionString)
    )
    captionEnd = html[captionStart:].find('"') + captionStart
    caption = html[captionStart:captionEnd]

    # Decode HTML entities in the user-visible strings.
    parser = HTMLParser()
    caption = parser.unescape(caption)
    title = parser.unescape(title)

    return '*' + title + "*\nhttp:" + str(imageAddress) + '\n' + caption
Beispiel #29
0
	def __init__(self):
		"""Prepare an empty page capture: no tag seen, blank title, no links."""
		HTMLParser.__init__(self)
		self.lasttag = None
		self.inbody = False
		self.title = ""
		self.links = []
		self.pagedata = StringIO()
Beispiel #30
0
 def __init__(self):
     """Reset all per-run scrape state: table-parsing flags, rate/lumi
     accumulators, and the page-URL strings."""
     HTMLParser.__init__(self)
     # HTML-table parsing state
     self.InRow = 0
     self.InEntry = 0
     self.table = []
     self.tmpRow = []
     self.hyperlinks = []
     # per-run quantities
     self.RunNumber = 0
     self.TriggerRates = []
     self.Nevts = []
     self.LiveLumiByLS = []
     self.DeliveredLumiByLS = []
     self.FirstLS = -1
     self.LastLS = -1
     self.AvLiveLumi = []
     self.AvDeliveredLumi = []
     self.AvDeadtime = []
     self.DeadTime = []
     self.L1Prescales = []
     # source page URLs
     self.RunPage = ''
     self.RatePage = ''
     self.LumiPage = ''
     self.L1Page = ''
     self.L1_LS_Page = ''
     self.PrescaleColumn = []
     self.PrescaleColumnString = ''