Ejemplo n.º 1
0
    def _issue_to_dict(self, issue):
        """Convert an issue into the argument dict sent to JIRA.

        Old API: classic issue payload (project/summary/description/type)
        plus custom fields and the caller field.  New API: a Service Desk
        request payload with the caller listed as request participant.
        """
        # FIX: the parser was instantiated twice on the old-API path.
        parser = HTMLParser()

        if self.use_old_api:
            args = {
                'project': self.project_settings['key'],
                'summary': parser.unescape(issue.summary),
                'description': parser.unescape(issue.description),
                'issuetype': {'name': issue.type},
            }
            args.update(self._get_custom_fields(issue))

            # Prefer the support user's backend id; fall back to the email.
            try:
                support_user = models.SupportUser.objects.get(user=issue.caller)
                key = support_user.backend_id or issue.caller.email
            except models.SupportUser.DoesNotExist:
                key = issue.caller.email

            args[self.get_field_id_by_name(self.issue_settings['caller_field'])] = [{
                "name": key,  # will be equal to issue.caller.email for non-support users
                "key": key,
            }]
            return args

        args = {
            'requestFieldValues': {
                'summary': parser.unescape(issue.summary),
                'description': parser.unescape(issue.description)
            }
        }

        support_customer = issue.caller.supportcustomer
        args['requestParticipants'] = [support_customer.backend_id]
        return args
Ejemplo n.º 2
0
 def feed(self, string):
     """Feed markup to the base HTMLParser; fall back to raw output.

     If parsing raises, the original string is written unmodified to
     self.out so no content is lost.
     """
     try:
         HTMLParser.feed(self, string)
     except Exception:  # pragma: no cover
         # FIX: removed the unused `as e` binding; traceback prints the
         # active exception anyway.
         import traceback
         traceback.print_exc()
         self.out.write(string)
Ejemplo n.º 3
0
 def convert_to_colour_list(cls, colours, *args, **kwargs):
     """
     Takes a whole munge of nonsense input, converts it into a list of colours.
     Will split apart comma delimited strings. Will decode HTML chars. Will
     concatenate a mixture of comma strings and items
     """
     colours = copy.deepcopy(colours)  # Ensure we don't bugger up original
     if isinstance(colours, (str, unicode)):
         colours = [colours]  # Listify
     colours.extend(args)
     intermediate_list = []
     # Add in comma delimited stuff
     h = HTMLParser()
     for colour_term in colours:
         if isinstance(colour_term, (str, unicode)):
             colour_term_decoded = h.unescape(
                 colour_term)  # HTML char decode
             colour_terms_list = colour_term_decoded.split(",")
             intermediate_list.extend(colour_terms_list)
         else:
             intermediate_list.append(colour_term)
     # Now sanitise the list again
     output_list = []
     for colour in intermediate_list:
         if isinstance(colour, (str, unicode)):
             # FIX: the stripped value was previously computed and then
             # discarded; the unstripped string was appended instead.
             colour = colour.strip()
         output_list.append(colour)
     return output_list
Ejemplo n.º 4
0
 def __init__(self, builder=None, encoding=None):
     """Set up the parser with a target tree builder and source encoding."""
     # Default to a plain ElementTree builder when none is supplied.
     if builder is None:
         builder = ElementTree.TreeBuilder()
     self.__stack = []
     self.__builder = builder
     # Latin-1 is the historical HTML default encoding.
     self.encoding = encoding or "iso-8859-1"
     HTMLParser.__init__(self)
Ejemplo n.º 5
0
 def _prepare_message(self, message):
     """Return *message* with HTML entities decoded and markup removed."""
     # Slack does not process HTML entities...
     decoded = HTMLParser().unescape(message)
     # ...nor does it render HTML markup itself.
     return strip_tags(decoded)
Ejemplo n.º 6
0
    def __init__(self):
        """Initialise link-collection state before parsing begins."""
        HTMLParser.__init__(self)
        # Collected links plus the state of the anchor currently open.
        self.links = []
        self.title = ''
        self.attrs = None
        self.in_anchor = False
Ejemplo n.º 7
0
    def __init__(self, *args, **kwargs):
        # Initialise the HTMLParser base explicitly (outside the cooperative
        # super() chain) so the version-specific keyword can be controlled.
        if sys.version_info > (3,4):  #pragma: no cover
            # Python 3.4+: keep character references verbatim so the
            # rewriter sees entities untouched.
            HTMLParser.__init__(self, convert_charrefs=False)
        else:  #pragma: no cover
            # Older parsers have no convert_charrefs parameter.
            HTMLParser.__init__(self)

        # Continue normal cooperative initialisation of the mixin chain.
        super(HTMLRewriter, self).__init__(*args, **kwargs)
Ejemplo n.º 8
0
    def _extract_programs(html, channel):
        """ Extract Programs from HTML code """
        parser = HTMLParser()

        # Matches one program teaser anchor, capturing its path and title.
        regex_item = re.compile(
            r'<a[^>]+?href="(?P<path>[^"]+)"[^>]+?>'
            r'.*?<h3 class="poster-teaser__title"><span>(?P<title>[^<]*)</span></h3>.*?'
            r'</a>', re.DOTALL)

        programs = []
        for match in regex_item.finditer(html):
            href = match.group('path')
            # Individual videos are not programs; skip them.
            if href.startswith('/video'):
                continue

            programs.append(Program(
                path=href.lstrip('/'),
                channel=channel,
                title=parser.unescape(match.group('title')),
            ))

        return programs
Ejemplo n.º 9
0
    def get_hot_bills(self):
        """
        Get list of most viewed bills from last week

        @return: list of dicts of the form: {
            'congress': which # congress,
            'number': bill #,
            'title': short text,
        }
        Returns None when the expected table is absent from the page.
        """
        soup = self._get("Most-Viewed+Bills")
        table = soup.find("table", class_="confluenceTable")
        if table:
            to_ret = []
            rows = table.findAll("tr")
            h = HTMLParser()
            for row in rows:
                # FIX: removed an unused `bills = {}` created on every row.
                columns = row.findAll("td")
                if columns and len(columns) == 3:
                    bill = {}
                    bill['congress'] = re.search(
                        r"\[(\d+)\w+\]", columns[1].contents[1]).groups()[0]
                    bill['congress'] = int(bill['congress'])
                    bill['number'] = columns[1].find("a").contents[0].strip()
                    bill['title'] = h.unescape(
                        re.sub(r"\"", "", columns[2].contents[0]))
                    to_ret.append(bill)
            return to_ret
Ejemplo n.º 10
0
 def feed(self, data):
     """Normalise quirky markup, then parse and close in one pass."""
     # Escape "<!" sequences that do not open a doctype, comment or CDATA.
     data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
     # Expand XML-style self-closing tags via _shorttag_replace.
     data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
     # Decode the numeric apostrophe/quote entities up front.
     data = data.replace('&#39;', "'").replace('&#34;', '"')
     HTMLParser.feed(self, data)
     HTMLParser.close(self)
Ejemplo n.º 11
0
 def extended_stats(self, user=None):
     """Log extended statistics for the authenticated account or *user*.

     Raises InvalidParameter when *user* is neither falsy nor a str.
     """
     if not user:
         data = self.api.me()
     elif isinstance(user, str):
         data = self.api.get_user('%s' % str(user.replace('@', '')))
     else:
         raise InvalidParameter
     logging.info("[*] Created: %s" % data.created_at)
     logging.info("[*] Description: %s" % data.description)
     logging.info("[*] Last update: %s" % data.status.created_at)
     hashtags = ' '.join(
         ["#%s" % x['text'] for x in data.status.entities['hashtags']])
     mentions = ' '.join(
         ["@%s" % x['screen_name'] for x in data.status.entities['user_mentions']])
     logging.info("[*] \tUser Mentions: %s" % mentions)
     logging.info("[*] \tHashtags: %s" % hashtags)
     # FIX: the unescape/log call was duplicated across both branches;
     # only the label differs, so compute it once.
     html = HTMLParser()
     text = html.unescape(data.status.text.replace('\n', '\n\t\t    '))
     label = "Retweet Text" if "RT @" in data.status.text else "Tweet Text"
     logging.info("[*] \t%s: %s" % (label, text))
     logging.info('[*] \tRetweet Count: %s' %
                  str(data.status.retweet_count))
Ejemplo n.º 12
0
def strip_tags(string, allowed_tags=''):
    """Remove HTML tags from *string*, keeping those named in *allowed_tags*
    (a comma-separated list), then unescape HTML entities.
    """
    if allowed_tags != '':
        # Get a list of all allowed tag names.
        allowed_tags_list = re.sub(r'[\\/<> ]+', '', allowed_tags).split(',')
        allowed_pattern = ''
        for s in allowed_tags_list:
            if s == '':
                continue
            # Add all possible patterns for this tag to the regex.
            if allowed_pattern != '':
                allowed_pattern += '|'
            # FIX: each alternative previously ended with a trailing '|',
            # leaving an empty alternation that matched EVERY tag, so
            # nothing was ever stripped.
            allowed_pattern += '<' + s + ' [^><]*>$|<' + s + '>'
        # Get all tags included in the string.
        # FIX: the tag-finding regex was r'<]+>', which matches nothing;
        # it should match a full tag.
        all_tags = re.findall(r'<[^>]+>', string, re.I)
        for tag in all_tags:
            # If not allowed, replace it.
            if not re.match(allowed_pattern, tag, re.I):
                string = string.replace(tag, '')
    else:
        # If no allowed tags, remove all.
        string = re.sub(r'<[^>]*?>', '', string)

    h = HTMLParser()
    string = h.unescape(string)

    return string
def parse_denied_courses(school_html):
    """Parse the 'denied courses' table out of a school's HTML page.

    Returns a list of dicts, one per course row, keyed by column name.
    Subject headers without the expected cell are skipped.
    """
    root = fromstring(school_html)
    denied_table = root.cssselect('#NcaaCrs_DeniedCategory_All')
    courses = []
    # FIX: hoisted — a fresh HTMLParser was constructed for every row.
    h = HTMLParser()
    for tr in denied_table[0].cssselect('tr')[1:]:
        tables = tr.cssselect('table')
        try:
            subject = tables[0].cssselect('.hs_tableHeader')[0].text_content()
        except IndexError:
            continue

        for course_tr in tables[1].cssselect('tbody tr'):
            tds = course_tr.cssselect('td')

            course = {
                'subject': subject,
                'course_weight': tds[0].text_content().strip(),
                'title': h.unescape(tds[1].text_content().strip()),
                'notes': tds[2].text_content().strip(),
                'max_credits': tds[3].text_content().strip(),
                'ok_through': tds[4].text_content().strip(),
                'reason_code': tds[5].text_content().strip(),
                'disability_course': tds[6].text_content().strip(),
            }

            courses.append(course)

    return courses
Ejemplo n.º 14
0
def get_link(url):
    """Resolve a stream URL from a wrapped player page.

    Returns a single (url, quality) pair, or (None, None) when the user
    cancels the quality-selection dialog.  Raises when the stream's
    data-options attribute is missing (link removed).
    """
    # Proxy links wrap the real target in a ?link= query parameter.
    if 'apitvh.net' in url \
            or 'tvhayz.net' in url \
            or 'tvhai.org' in url \
            or 'tvhays.org' in url \
            :
        url = re.search(r'\?link=(.*)', url).group(1)

    response = Request().get(url)
    m = re.search('data-options="(.+?)"', response)
    h = HTMLParser()
    # FIX: replaced a bare `except:` around m.group(1); only a missing
    # match should map to the "removed" error.
    if m is None:
        raise Exception("Link has been removed")
    s = h.unescape(m.group(1))
    s = json.loads(s)
    s = json.loads(s['flashvars']['metadata'])
    items = [(i['url'], rsl(i['name'])) for i in s['videos']]
    items = sorted(items, key=lambda elem: int(elem[1]), reverse=True)

    if len(items) == 1:
        return items[0]

    listitems = []
    for i in items:
        listitems.append("%s (%s)" % (i[1], i[0]))
    index = xbmcgui.Dialog().select("Select ok.ru stream", listitems)
    if index == -1:
        return None, None
    else:
        return items[index]
Ejemplo n.º 15
0
    def __init__(self, *args, **kwargs):
        # Initialise the HTMLParser base directly so we can control the
        # version-specific convert_charrefs keyword before the cooperative
        # super() chain runs.
        if sys.version_info > (3, 4):  #pragma: no cover
            # Keep character references verbatim; the rewriter handles them.
            HTMLParser.__init__(self, convert_charrefs=False)
        else:  #pragma: no cover
            # Older parsers do not accept convert_charrefs.
            HTMLParser.__init__(self)

        super(HTMLRewriter, self).__init__(*args, **kwargs)
Ejemplo n.º 16
0
    def _issue_to_dict(self, issue):
        """ Convert issue to dict that can be accepted by JIRA as input parameters """
        parser = HTMLParser()
        caller = issue.caller.full_name or issue.caller.username
        args = {
            'project': self.project_settings['key'],
            'summary': parser.unescape(issue.summary),
            'description': parser.unescape(issue.description),
            'issuetype': {'name': issue.type},
            self._get_field_id_by_name(self.issue_settings['caller_field']): caller,
        }

        # Optional fields are added only when present on the issue.
        if issue.reporter:
            reporter_field = self._get_field_id_by_name(
                self.issue_settings['reporter_field'])
            args[reporter_field] = issue.reporter.name
        if issue.impact:
            impact_field = self._get_field_id_by_name(
                self.issue_settings['impact_field'])
            args[impact_field] = issue.impact
        if issue.priority:
            args['priority'] = {'name': issue.priority}
        return args
Ejemplo n.º 17
0
 def feed(self, string):
     """Parse *string*; on any failure, emit it verbatim to self.out.

     Best-effort rewriting: a parser crash must never lose content.
     """
     try:
         HTMLParser.feed(self, string)
     except Exception:  # pragma: no cover
         # FIX: dropped the unused exception binding; print_exc() already
         # reports the current exception.
         import traceback
         traceback.print_exc()
         self.out.write(string)
Ejemplo n.º 18
0
 def __init__(self, encoding='iso8859-1'):
     """Prepare parser state for a document in *encoding*."""
     HTMLParser.__init__(self)
     self.encoding = encoding
     # Stack of currently-open tags.
     self.tagstack = []
     self.checkflag = 0  # Are we in a tag we check?
     # Whether we have entered the document body yet.
     self.inbody = 0
     # Accumulated character data.
     self.__data = []
Ejemplo n.º 19
0
def display_link_prompt(args, urls, titles):
    """Print URLs and their descriptions alongside a prompt.

    Keyword arguments:
    args -- program arguments (dict)
    urls -- search URLs found (list)
    titles -- descriptions of search URLs found (list)
    """
    # FIX: hoisted the HTMLParser out of the loop (one was built per
    # title on every redraw) and used idiomatic `while True`/enumerate.
    parser = HTMLParser()
    while True:
        print('\n{0}'.format(BORDER))
        for i, _url in enumerate(urls):
            link = parser.unescape(titles[i])
            print('{0}. {1}'.format(i+1, link.encode('utf-8') if PY2 else link))
        print(BORDER)

        # Handle link prompt input
        try:
            link_input = [inp.strip() for inp in input(': ').split()]
            if not link_input:
                continue
            utils.check_input(link_input)  # Check input in case of quit
            print('\n')
            exec_prompt_cmd(args, urls, link_input[0], link_input[1:])
        except (KeyboardInterrupt, EOFError, ValueError, IndexError):
            return False
Ejemplo n.º 20
0
    def __init__(self, url):
        """Remember the crawl base URL (with trailing slash) and reset links."""
        HTMLParser.__init__(self)

        # Normalise the base URL so relative joins behave predictably.
        if url[-1] != '/':
            url = url + '/'
        self.__url = url
        self.links = set()
Ejemplo n.º 21
0
    def __init__(self, url, session=None, authentication=None, timeout=None):
        """Create instance of a directory parser.

        :param url: url of the directory on the web server.
        :param session: a requests Session instance used to fetch the directory
                        content. If None, a new session will be created.
        :param authentication: a tuple (username, password) to authenticate against
                               the web server, or None for no authentication. Note
                               that it will only be used if the given *session* is
                               None.
        :param timeout: timeout in seconds used when fetching the directory
                        content.
        """
        if not session:
            session = requests.Session()
            session.auth = authentication
        self.session = session
        self.timeout = timeout

        self.active_url = None
        self.entries = []

        HTMLParser.__init__(self)

        # Force the server to not send cached content
        headers = {'Cache-Control': 'max-age=0'}
        r = self.session.get(url, headers=headers, timeout=self.timeout)

        try:
            r.raise_for_status()
            # Feeding the document drives the HTMLParser callbacks, which
            # populate self.entries.
            self.feed(r.text)
        finally:
            # Release the connection even when raise_for_status throws.
            r.close()
Ejemplo n.º 22
0
    def get_programs(self, channel):
        """ Get a list of all programs of the specified channel.
        :type channel: str
        :rtype list[Program]
        NOTE: This function doesn't use an API.
        """
        if channel not in CHANNELS:
            raise Exception('Unknown channel %s' % channel)

        # Fetch the channel overview page.
        html = self._get_url(CHANNELS[channel]['url'])

        # Each match captures one program link (path + title).
        h = HTMLParser()
        regex_programs = re.compile(
            r'<a class="program-overview__link" href="(?P<path>[^"]+)">\s+'
            r'<span class="program-overview__title">\s+(?P<title>[^<]+)</span>.*?'
            r'</a>', re.DOTALL)

        results = []
        for match in regex_programs.finditer(html):
            results.append(Program(
                channel=channel,
                path=match.group('path').lstrip('/'),
                title=h.unescape(match.group('title').strip()),
            ))

        return results
Ejemplo n.º 23
0
def _get_event():
    """Return the session event matching the 'event' query arg as JSON,
    or an empty string when no such event exists."""
    matches = [e for e in session['events'] if e['id'] == request.args.get('event')]
    if not matches:
        return ''
    found = matches[0]
    # Descriptions are stored HTML-escaped; decode before returning.
    found['description'] = HTMLParser().unescape(found['description'])
    return jsonify(found)
Ejemplo n.º 24
0
    def __init__(self):
        """Start with an empty output buffer; decode charrefs on Python 3."""
        if is_py3():
            # Python 3 parsers can decode character references for us.
            HTMLParser.__init__(self, convert_charrefs=True)
        else:
            HTMLParser.__init__(self)

        self._output = ''
Ejemplo n.º 25
0
    def get_episode(self, channel, path):
        """ Get a Episode object from the specified page.
        :type channel: str
        :type path: str
        :rtype Episode
        NOTE: This function doesn't use an API.
        """
        if channel not in CHANNELS:
            raise Exception('Unknown channel %s' % channel)

        # Download the episode page.
        page = self._get_url(CHANNELS[channel]['url'] + '/' + path)

        parser = HTMLParser()

        # The program JSON lives in a data-hero attribute.
        regex_program = re.compile(r'data-hero="([^"]+)', re.DOTALL)
        program_json = parser.unescape(regex_program.search(page).group(1))
        program = self._parse_program_data(json.loads(program_json)['data'])

        # The Drupal settings JSON identifies this page's nodeId.
        regex_episode = re.compile(
            r'<script type="application/json" data-drupal-selector="drupal-settings-json">(.*?)</script>',
            re.DOTALL)
        episode_json = parser.unescape(regex_episode.search(page).group(1))
        node_id = json.loads(episode_json)['pageInfo']['nodeId']

        # The page JSON is incomplete, so return the matching episode from
        # the full program listing instead.
        for episode in program.episodes:
            if episode.nodeid == node_id:
                return episode

        return None
Ejemplo n.º 26
0
 def feed(self, data):
     """Pre-process awkward markup, then parse and close in one shot."""
     # Guard stray "<!" sequences that are not doctype/comment/CDATA.
     bang_guard = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE)
     data = bang_guard.sub(r'&lt;!\1', data)
     # Rewrite XML-style self-closing tags through _shorttag_replace.
     data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
     # Decode numeric apostrophe and quote entities eagerly.
     data = data.replace('&#39;', "'")
     data = data.replace('&#34;', '"')
     HTMLParser.feed(self, data)
     HTMLParser.close(self)
Ejemplo n.º 27
0
def cmd_genpot(config, options):
    """Generate the gettext pot file"""

    os.chdir(config.source_dir)

    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)

    # Collect every Python source file, relative to the source dir.
    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                file_path = os.path.relpath(os.path.join(root, file_name),
                                            config.source_dir)
                python_files.append(file_path)
    python_files.sort()

    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    # FIX: use a context manager so the file is closed even if writing
    # or description parsing raises.
    with open(pot_file, 'w') as f:
        f.write('#: activity/activity.info:2\n')
        f.write('msgid "%s"\n' % escaped_name)
        f.write('msgstr ""\n')
        if config.summary is not None:
            escaped_summary = _po_escape(config.summary)
            f.write('#: activity/activity.info:3\n')
            f.write('msgid "%s"\n' % escaped_summary)
            f.write('msgstr ""\n')

        if config.description is not None:
            # Strip markup from the description by collecting only the
            # parser's character-data callbacks.
            parser = HTMLParser()
            strings = []
            parser.handle_data = strings.append
            parser.feed(config.description)

            for s in strings:
                s = s.strip()
                if s:
                    f.write('#: activity/activity.info:4\n')
                    f.write('msgid "%s"\n' % _po_escape(s))
                    f.write('msgstr ""\n')

    args = [
        'xgettext', '--join-existing', '--language=Python', '--keyword=_',
        '--add-comments=TRANS:',
        '--output=%s' % pot_file
    ]

    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        print('ERROR - xgettext failed with return code %i.' % retcode)
Ejemplo n.º 28
0
 def original_unescape(self, s):
     """Since we need to use this sometimes"""
     # Lists are unescaped element-wise; a list is never a basestring,
     # so the order of these checks does not matter.
     if isinstance(s, list):
         return [unicode(HTMLParser.unescape(self, item)) for item in s]
     if isinstance(s, basestring):
         return unicode(HTMLParser.unescape(self, s))
     # Anything else passes through untouched.
     return s
Ejemplo n.º 29
0
 def __init__(self, allows=None):
     """Create a filtering parser; *allows* lists tag names to keep."""
     HTMLParser.__init__(self)
     if allows is None:
         allows = []
     # Only override the class-level whitelist when tags were supplied.
     self.allow_tags = allows if allows else self.allow_tags
     # Output fragments plus per-tag scratch state.
     self.result = []
     self.start = []
     self.data = []
Ejemplo n.º 30
0
    def __init__(self, style, styles = None):
        """Start with an empty output buffer and the given style context."""
        HTMLParser.__init__(self)

        self.s = ''
        self.style = style
        self.style_stack = []

        # Fall back to the module-wide defaults when no mapping is given.
        self.styles = styles if styles else default_styles
Ejemplo n.º 31
0
    def __init__(self, styled, styles=None):
        """Initialise the parser with an output buffer and style stack."""
        HTMLParser.__init__(self)

        self.s = ''
        self.styled = styled
        self.style_stack = []

        # Use the module defaults unless an explicit mapping was passed.
        self.styles = styles if styles else default_styles
Ejemplo n.º 32
0
    def __init__(self, bot: "Bot"):
        """Wire up translation, HTML unescaping and spam tracking."""
        self.bot = bot
        self.translate_client = translate.Client()  # _http=self.bot.http)
        self.h = HTMLParser()

        # One spam checker per sender, created on demand.
        self._spam_check = defaultdict(SpamChecker)

        # channel_id: list
        self.chat_history = defaultdict(list)
Ejemplo n.º 33
0
 def __init__(self):
     """Reset traversal state used while measuring tag depth and text."""
     HTMLParser.__init__(self)
     # Ignore-mode flag plus the path at which ignoring began.
     self._ignore = False
     self._ignorePath = None
     self._lasttag = None
     # Depth bookkeeping and the text collected per tag path.
     self._depth = 0
     self.depthText = {}  # path:text
     self.counting = 0
     self.lastN = 0
Ejemplo n.º 34
0
    def __init__(self, media_locator, link_handler):
        """Create handler rule tables and an empty buffer/stack."""
        HTMLParser.__init__(self)
        # Rule sets for start, self-closing and end tags.
        self.handlers_start = StartRules(media_locator, link_handler)
        self.handlers_startend = StartEndRules(media_locator, link_handler)
        self.handlers_end = EndRules()

        self.new_buffer()
        # The stack always holds at least one (initially empty) frame.
        self.stack = deque()
        self.stack.append([])
Ejemplo n.º 35
0
    def __init__(self, max_words):
        """Truncating parser: stops once *max_words* words have been seen."""
        # In Python 2, HTMLParser is not a new-style class, so super()
        # cannot be used here.
        HTMLParser.__init__(self)

        self.max_words = max_words
        self.words_found = 0
        self.open_tags = []
        # Offset where truncation happened; None until the limit is hit.
        self.truncate_at = None
Ejemplo n.º 36
0
File: utils.py — Project: 52M/pelican
 def feed(self, *args, **kwargs):
     """Feed data, recording where truncation occurred (if at all)."""
     # With Python 2, super() cannot be used; see __init__().
     try:
         HTMLParser.feed(self, *args, **kwargs)
     except self.TruncationCompleted as exc:
         # The exception carries the offset at which the limit was hit.
         self.truncate_at = exc.truncate_at
     else:
         self.truncate_at = None
Ejemplo n.º 37
0
 def __init__(self, _file, search_tag):
     """Remember the target file and tag, and initialise the parser base."""
     if six.PY3:
         super(TemplateParser, self).__init__()
     else:
         # HTMLParser is not a new-style class in py2
         HTMLParser.__init__(self)
     self.file = _file
     self.search_tag = search_tag
     self.parsed_data = []
Ejemplo n.º 38
0
 def feed(self, *args, **kwargs):
     """Run the underlying parser and capture the truncation point."""
     try:
         # super() is unavailable under Python 2 (old-style base class);
         # see the comment for __init__().
         HTMLParser.feed(self, *args, **kwargs)
     except self.TruncationCompleted as exc:
         self.truncate_at = exc.truncate_at
     else:
         # Input ended before the word limit was reached.
         self.truncate_at = None
Ejemplo n.º 39
0
 def __init__(self):
     """Initialise depth/text bookkeeping for a fresh parse."""
     HTMLParser.__init__(self)
     self._ignore = False
     self._ignorePath = None
     self._lasttag = None
     self._depth = 0
     # Maps a tag path to the text collected at that depth.
     self.depthText = {}  # path:text
     self.counting = 0
     self.lastN = 0
Ejemplo n.º 40
0
def replaceHTMLCodes(txt):
    """Decode HTML character references in *txt*.

    Repairs numeric entities missing their trailing semicolon, runs the
    generic unescape, then applies belt-and-braces replacements for
    entities the parser may have missed.
    """
    # FIX: use raw strings for the regex pattern and replacement.
    # NOTE(review): '^' inside the negated class is a literal character,
    # so '^' is excluded from the repair — possibly [^;0-9] was intended;
    # confirm before changing, as it would alter output.
    txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", r"\1;\2", txt)
    txt = HTMLParser().unescape(txt)
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    txt = txt.replace("&#38;", "&")
    txt = txt.replace("&nbsp;", "")
    return txt
Ejemplo n.º 41
0
    def __init__(self, max_words):
        """Word-limit truncation parser."""
        # HTMLParser is an old-style class under Python 2, hence no
        # super() here.
        HTMLParser.__init__(self)

        self.max_words = max_words  # stop once this many words are seen
        self.words_found = 0
        self.open_tags = []
        self.truncate_at = None
    def __init__(self):
        """Reset per-document parsing state and the read buffer."""
        HTMLParser.__init__(self)

        # State for the text field currently being processed.
        self.text_name = None
        self.original_value = None
        self.new_value = None

        self.in_tag = False
        self.read_buffer = six.StringIO()
Ejemplo n.º 43
0
def cmd_genpot(config, options):
    """Generate the gettext pot file"""

    os.chdir(config.source_dir)

    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)

    # Collect every Python source file, relative to the source dir.
    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                file_path = os.path.relpath(os.path.join(root, file_name),
                                            config.source_dir)
                python_files.append(file_path)
    python_files.sort()

    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    # FIX: open the stub file with a context manager so it is closed
    # even when writing raises.
    with open(pot_file, 'w') as f:
        f.write('#: activity/activity.info:2\n')
        f.write('msgid "%s"\n' % escaped_name)
        f.write('msgstr ""\n')
        if config.summary is not None:
            escaped_summary = _po_escape(config.summary)
            f.write('#: activity/activity.info:3\n')
            f.write('msgid "%s"\n' % escaped_summary)
            f.write('msgstr ""\n')

        if config.description is not None:
            # Strip markup: collect only the parser's character data.
            parser = HTMLParser()
            strings = []
            parser.handle_data = strings.append
            parser.feed(config.description)

            for s in strings:
                s = s.strip()
                if s:
                    f.write('#: activity/activity.info:4\n')
                    f.write('msgid "%s"\n' % _po_escape(s))
                    f.write('msgstr ""\n')

    args = ['xgettext', '--join-existing', '--language=Python',
            '--keyword=_', '--add-comments=TRANS:', '--output=%s' % pot_file]

    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        print('ERROR - xgettext failed with return code %i.' % retcode)
Ejemplo n.º 44
0
 def logged_in(self, y):
     # Logged-in check: either no page was fetched yet (y is None) or the
     # page shows a "logout" control, AND a "remember_web_" cookie exists.
     # NOTE(review): this looks Python-2-only — under Python 3 filter()
     # returns a lazy object that is always truthy, and v.decode() assumes
     # bytes; confirm before porting.
     if all([None is y or 'logout' in y,
             bool(filter(lambda c: 'remember_web_' in c, self.session.cookies.keys()))]):
         if None is not y:
             # Map show ids to sanitised show names from the <option> list.
             self.shows = dict(re.findall('<option value="(\d+)">(.*?)</option>', y))
             h = HTMLParser()
             for k, v in self.shows.items():
                 self.shows[k] = sanitizeSceneName(h.unescape(unidecode(v.decode('utf-8'))))
         return True
     return False
Ejemplo n.º 45
0
 def add_set(self, title, description, index=-1):
     """Add a checkbox for *title*; insert at *index* when non-negative."""
     # Escape '&' so Qt does not treat it as a mnemonic marker.
     widget = QtWidgets.QCheckBox(title.replace('&', '&&'))
     if description:
         # Tooltips are stored HTML-escaped; decode before showing.
         widget.setToolTip(HTMLParser().unescape(description))
     layout = self.sets_widget.layout()
     if index >= 0:
         layout.insertWidget(index, widget)
     else:
         layout.addWidget(widget)
     return widget
Ejemplo n.º 46
0
    def __init__(self, *args, **kwargs):
        """Initialise anchor-capture state and the parser base class."""
        if sys.version_info > (3,):
            super(AnchorParser, self).__init__(*args, **kwargs)
        else:   # pragma: no cover
            # HTMLParser is still an old style object on Python 2, so
            # super() doesn't work there.
            HTMLParser.__init__(self, *args, **kwargs)

        # Capture depth plus the URL/text of the anchor being read.
        self.capture = 0
        self.url = ''
        self.text = ''
Ejemplo n.º 47
0
def check_bz_bug(b):
    ''' Return status of a bug in BZ'''
    html = get_html(b)
    name = ''
    if html:
        text = html.content.decode('utf-8')
        # FIX: TITLE.search() was executed twice on the same text.
        match = TITLE.search(text)
        if match:
            h = HTMLParser()
            name = h.unescape(match.group(1))
        # No title match leaves name as '' (unescape('') was '' anyway).
    return name, None
Ejemplo n.º 48
0
def get_formatted_value(value, field):
	'''Prepare field from raw data'''

	from six.moves.html_parser import HTMLParser

	if(getattr(field, 'fieldtype', None) in ["Text", "Text Editor"]):
		h = HTMLParser()
		value = h.unescape(value)
		# FIX: the inline (?s) flag must lead the pattern — a trailing
		# position is an error on Python 3.11+.
		value = (re.subn(r'(?s)<[\s]*(script|style).*?</\1>', '', text_type(value))[0])
		value = ' '.join(value.split())
	return field.label + " : " + strip_html_tags(text_type(value))
Ejemplo n.º 49
0
 def get_email_subject(self):
     """
     WARNING: It is MANDATORY to override method if you are going to
     send email using the  `send_notification_email` method.
     Your class must define an `email_subject_tmpl` attribute
     containing a template path to a file that has your email subject.
     """
     # Rendering the template can introduce HTML ampersand codes, so
     # convert the rendered subject back to plain text.
     rendered = self._get_email_field('email_subject_tmpl', 'get_email_subject')
     return HTMLParser().unescape(rendered)
 def __init__(self):
     """Register start/end handlers for the table elements we parse."""
     HTMLParser.__init__(self)
     self._encoding = 'ISO-8859-1'
     # th cells deliberately reuse the td handlers.
     self._handlers = {
         'table_start': self.table_start,
         'table_end': self.table_end,
         'tr_start': self.tr_start,
         'tr_end': self.tr_end,
         'td_start': self.td_start,
         'td_end': self.td_end,
         'th_start': self.td_start,
         'th_end': self.td_end,
         'br_start': self.br_start,
         'meta_start': self.meta_start,
     }
Ejemplo n.º 51
0
    def __init__(self, tag="a", attr="href", process=None, unique=False):
        """Deprecated extractor; normalises options into callables."""
        HTMLParser.__init__(self)

        warnings.warn(
            "HtmlParserLinkExtractor is deprecated and will be removed in "
            "future releases. Please use scrapy.linkextractors.LinkExtractor",
            ScrapyDeprecationWarning, stacklevel=2,
        )

        # Each option may be a callable already; otherwise wrap it in an
        # equality predicate (or identity transform for `process`).
        if callable(tag):
            self.scan_tag = tag
        else:
            self.scan_tag = lambda t: t == tag
        if callable(attr):
            self.scan_attr = attr
        else:
            self.scan_attr = lambda a: a == attr
        if callable(process):
            self.process_attr = process
        else:
            self.process_attr = lambda v: v
        self.unique = unique
Ejemplo n.º 52
0
File: utils.py — Project: 52M/pelican
    def __init__(self, max_words):
        """Truncation parser; tolerates parsers without convert_charrefs."""
        # In Python 2, HTMLParser is not a new-style class, hence super()
        # cannot be used.
        try:
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # pre Python 3.3
            HTMLParser.__init__(self)

        self.max_words = max_words
        self.words_found = 0
        self.open_tags = []
        # Position of the last word boundary / final truncation offset.
        self.last_word_end = None
        self.truncate_at = None
Ejemplo n.º 53
0
def replace_html_entities(xml_bin_str):
    """XML does not contain entity references for many HTML entities, yet the
    Federal Register XML sometimes contains the HTML entities. Replace them
    here, lest we throw off XML parsing"""
    parser = HTMLParser()
    # Re-scan after every substitution until no entity remains.
    while True:
        match = HTML_RE.search(xml_bin_str)
        if not match:
            break
        entity_bytes = match.group(0)
        entity_str = entity_bytes.decode('utf-8')
        replacement = parser.unescape(entity_str).encode('UTF-8')
        logger.debug("Replacing %s with %s in retrieved XML",
                     entity_str, replacement)
        xml_bin_str = xml_bin_str.replace(entity_bytes, replacement)
    return xml_bin_str
Ejemplo n.º 54
0
 def parse_endtag(self, i):
     # This is necessary because the underlying HTMLParser is buggy and
     # unreliable.
     try:
         return HTMLParser.parse_endtag(self, i)
     except AttributeError:
         # Mirror the parser's own "no complete tag here" convention by
         # returning -1 instead of propagating the internal failure.
         return -1
Ejemplo n.º 55
0
    def __init__(self, styled):
        """Map logical style names to terminal escape styles."""
        HTMLParser.__init__(self)

        self.s = ''
        self.styled = styled
        self.style_stack = []

        term = MyHTMLParser.term
        self.styles = {
            'err': term.red,
            'ref': term.yellow,
            'rev': term.bold,
            'cmd': term.cyan + self.term.underline,
            # 'sub': term.cyan,
            'echo': term.yellow,
        }
Ejemplo n.º 56
0
    def _internal_close(self):
        # If a parse context (e.g. an unterminated special tag) is still
        # open, synthesise its closing tag so parser state unwinds cleanly.
        if (self._wb_parse_context):
            end_tag = '</' + self._wb_parse_context + '>'
            self.feed(end_tag)
            self._wb_parse_context = None

        # if haven't insert head_insert, but wrote some content
        # out, then insert head_insert now
        if self.head_insert and self.parsed_any:
            self.out.write(self.head_insert)
            self.head_insert = None

        try:
            HTMLParser.close(self)
        except Exception:  # pragma: no cover
            # only raised in 2.6
            pass
Ejemplo n.º 57
0
class Feed:
    """Wraps a BeautifulSoup-parsed RSS document and extracts feed data."""

    def __init__(self, data, markup):
        self.obj = BeautifulSoup(data, markup)
        self.html_parser = HTMLParser()

    def getFeeds(self):
        """Return the feed's title, link and items as a dict."""
        return {
            'title': self.getTitle(),
            'link': self.getLink(),
            'items': self.setupItems(),
        }

    def getTitle(self):
        return self.obj.title.string

    def getLink(self):
        return self.obj.find('link').string

    def getItems(self):
        return self.obj.find_all('item')

    def setupItems(self):
        """Convert each <item> element to a plain dict, unescaping HTML
        entities in the title and author fields."""
        unescape = self.html_parser.unescape
        return [
            {
                'title': unescape(item.title.string),
                'link': item.find("link").string,
                # NOTE(review): unlike the other fields this stores the raw
                # <comments> tag object, not its text — confirm intentional.
                'comments_link': item.find("comments"),
                'publication_date': item.find('pubDate').text,
                'author': unescape(item.find('creator').text),
            }
            for item in self.getItems()
        ]
Ejemplo n.º 58
0
        def __init__(self, settings, filename):
            """Initialize parser state for extracting metadata and body
            text from *filename* using the given *settings*."""
            try:
                # convert_charrefs is only accepted on Python 3.4+;
                # older versions raise TypeError on the keyword.
                HTMLParser.__init__(self, convert_charrefs=False)
            except TypeError:
                HTMLParser.__init__(self)

            self.body = ''
            self.metadata = {}
            self.settings = settings
            self._data_buffer = ''
            self._filename = filename

            # Location flags, updated as tags open and close while parsing.
            self._in_top_level = True
            for flag in ('_in_head', '_in_title', '_in_body', '_in_tags'):
                setattr(self, flag, False)
Ejemplo n.º 59
0
 def __init__(self, typogrify, html_doc):
     """Parse *html_doc*, recording where each line starts so offsets in
     the input string can be mapped back to line numbers."""
     self.html_doc = html_doc.strip()
     try:
         # convert_charrefs keyword exists only on Python 3.4+.
         HTMLParser.__init__(self, convert_charrefs=False)
     except TypeError:
         HTMLParser.__init__(self)

     # Record the starting offset of every line: line 1 starts at 0,
     # line n+1 starts one past the n-th newline.
     # NOTE(review): assumes self.new_line_pos already exists (e.g. set
     # as a class attribute) — confirm. A StringIO-based approach was
     # suggested by the original author as an alternative.
     line_no = 1
     self.new_line_pos[line_no] = 0
     offset = self.html_doc.find("\n")
     while offset != -1:
         line_no += 1
         # Add one because the next line starts after the newline itself.
         self.new_line_pos[line_no] = offset + 1
         offset = self.html_doc.find("\n", offset + 1)

     self.typogrify = typogrify
     self.feed(self.html_doc)  # start parsing
Ejemplo n.º 60
0
 def save(self, page, crawl=False):
     """Fetch *page*, reduce it to plain text and write it either to the
     single output file (append mode) or into the output directory.

     NOTE(review): the text is encoded to a byte string before being
     written through a text-mode handle — this only works on Python 2;
     confirm the target interpreter.
     """
     print('Saving ' + page)
     html = self.fetch(page, crawl)

     # Paragraph ends become newlines, every other tag becomes a space,
     # then HTML entities are decoded.
     text = re.sub(r'<.*?>', ' ', html.replace('</p>', '\n'))
     text = HTMLParser().unescape(text).encode('utf8')

     if self.out_file:
         # Single-file mode: append with a trailing newline separator.
         with open(self.out_file, 'a') as handle:
             handle.write(text + '\n')
     elif self.out_dir:
         key = self.page_key(page)
         target_dir = os.path.join(self.out_dir, key[0])
         try:
             os.makedirs(target_dir)
         except OSError:
             # Directory already exists.
             pass
         basename = (key[1] + '?' + key[2]).replace('/', '_')
         with open(os.path.join(target_dir, basename), 'w') as handle:
             handle.write(text)