def insert_to(project_url, destination, find_what, indent=0):
    """Fetch a snippet from the project's ``magic/`` endpoint and splice it
    into ``destination`` after every line containing ``find_what``.

    The inserted lines are indented to match the matched line, plus ``indent``
    extra spaces.  If the snippet (ignoring spaces) already appears in the
    file, the file is left untouched.
    """
    snippet_url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
    response = urllib2.urlopen(snippet_url)
    if response.getcode() != 200:
        return
    with open(destination, 'r') as handle:
        existing_lines = handle.readlines()
        existing_text = ''.join(existing_lines)
    snippet = HTMLParser().unescape(response.read())
    # Compare with spaces stripped so a re-indented copy still counts as present.
    if snippet.replace(' ', '') in existing_text.replace(' ', ''):
        print_out('IGNORED', destination)
        return
    rewritten = []
    needle = find_what.lower()
    for original_line in existing_lines:
        rewritten.append(original_line)
        if needle in original_line.lower():
            pad = len(original_line) - len(original_line.lstrip())
            rewritten.extend('%s%s\n' % (' ' * (pad + indent), chunk)
                             for chunk in snippet.split('\n') if chunk)
    with open(destination, 'w') as handle:
        handle.writelines(rewritten)
    print_out('INSERT', destination)
def get_pagecount(self): u"""Получить количество сниппетов результатов выдачи """ pagecount = 0 patterns = ( ur'<div[^>]+resultStats(?:[^>]+)?>Результатов: примерно (.*?)<', ur'<div[^>]+resultStats(?:[^>]+)?>(.*?)<nobr>', ur'<div[^>]+resultStats(?:[^>]+)?>Результатов:(.*?)</div>', ur'<div>Результатов:\s*(.*?)</div>', ur'Результатов:\s*(.*?),.*?</div>', ur'из примерно <b>(.*?)</b>', ur'<div>Результаты:.*?из\s*<b>\s*(\d+)\s*</b>') response = self.content for pattern in patterns: res = re.findall( pattern, response, re.DOTALL | re.IGNORECASE | re.UNICODE | re.MULTILINE) if res: # html-символы могут встречаться в виде своих числовых кодов (например, =  ) # избавимся от закодированных последовательностей в результирующей строке res_str = HTMLParser().unescape(res[0]) results_ints = re.findall(r'\d+', res_str.split(',')[0]) if results_ints: return int(''.join(results_ints)) return pagecount
def parse_detail(self, response):
    """Parse a book-detail JSON response and yield a populated MaterialSourceItem.

    Skips responses that are not valid JSON (logging the raw body), that carry
    an error ``code`` field, or that describe a locked book.
    """
    try:
        detail_json = json.loads(response.body)
    except ValueError:
        self.logger.error('[NO JSON]' + response.url)
        # BUG FIX: open in append mode -- mode 'w' truncated the log on every
        # failure, so only the most recent undecodable body was ever kept.
        with open('no_json_decoded.log', 'a') as f:
            f.write(response.body)
        return
    if 'code' in detail_json:
        # The API signals an error response with a 'code' field.
        return
    if int(detail_json['islock']):
        self.logger.debug('[THIS BOOK IS CLOCK] ' + response.url)
        return
    item = MaterialSourceItem()
    item.update(response.meta['item'])
    item['relate_id'] = '%s_%s' % (item['source_id'], item['book_id'])
    item['url'] = 'http://www.jjwxc.net/onebook.php?novelid={}'.format(
        item['book_id'])
    item['folder_url'] = detail_json['novelCover']
    item['title'] = detail_json['novelName']
    item['author'] = detail_json['authorName']
    item['gender'] = u'女性向小说'
    introduction = HTMLParser().unescape(detail_json['novelIntro'])
    item['introduction'] = '\n'.join(
        p.strip() for p in introduction.split('<br/>') if p != '')
    item['created_at'] = now_date()
    item['updated_at'] = today_date()
    yield item
def parse(self): # remove punctuation from artist/title clean_artist = Util.remove_punctuation(self.artist) clean_title = Util.remove_punctuation(self.title) # create lyrics Url url = "http://www.metrolyrics.com/" + clean_title.replace(" ", "-") + "-lyrics-" + clean_artist.replace(" ", "-") + ".html" try: resp = urllib2.urlopen(url, None, 3).read() except: return "" # verify title title = resp start = title.find("<title>") if start == -1: return "" title = title[(start+7):] end = title.find(" LYRICS</title>") if end == -1: return "" title = title[:end] title = HTMLParser().unescape(title) songdata = title.split(" - ") try: if self.artist != songdata[0].lower() or self.title != songdata[1].lower(): print "wrong artist/title! " + songdata[0].lower() + " - " + songdata[1].lower() return "" except: return "" self.lyrics = self.get_lyrics(resp) self.lyrics = string.capwords(self.lyrics, "\n").strip() return self.lyrics
def on_new_comment(self):
    """React to a bors failure comment by posting the failing-test section
    of each build's stdio log back as an indented comment.
    """
    if self.api.sender != self.config["bors_name"]:
        return
    failure_patterns = self.config.get("failure_comment_patterns", [])
    if not any(re.search(p, self.api.comment) for p in failure_patterns):
        return
    urls = re.findall(r'.*\((.*)\)', self.api.comment)
    if not urls:
        return
    # Substitute and get the new url
    # (e.g. http://build.servo.org/json/builders/linux2/builds/2627)
    build_json_url = re.sub(r'(.*)(builders/.*)', r'\1json/\2', urls[0])
    raw_json = self.api.get_page_content(build_json_url)
    if not raw_json:
        return
    build_stats = json.loads(raw_json)
    failure_regex = r'Tests with unexpected results:\n(.*)\n</span><span'
    collected = []
    for step in build_stats['steps']:
        for log_name, log_url in step['logs']:
            if log_name != 'stdio':
                continue
            stdio = self.api.get_page_content(log_url)
            matches = re.findall(failure_regex, stdio, re.DOTALL)
            if not matches:
                continue
            try:
                failure_text = HTMLParser().unescape(matches[0])
            except UnicodeDecodeError:
                failure_text = HTMLParser().unescape(matches[0].decode('utf-8'))
            if 'css' in failure_text:
                # NOTE(review): passes the full findall list, not urls[0] --
                # looks intentional-ish but worth confirming.
                self._check_css_failures(urls)
            collected.extend(' ' * 4 + line for line in failure_text.split('\n'))
    if collected:
        self.api.post_comment('\n'.join(collected))
def parse(self): # remove punctuation from artist/title clean_artist = self.artist clean_title = self.title for c in string.punctuation: clean_artist = clean_artist.replace(c, "") clean_title = clean_title.replace(c, "") # create lyrics Url url = "http://www.metrolyrics.com/" + clean_title.replace(" ", "-") + "-lyrics-" + clean_artist.replace(" ", "-") + ".html" print "metrolyrics Url " + url try: resp = urllib2.urlopen(url, None, 3).read() except: print "could not connect to metrolyrics.com" return "" # verify title title = resp start = title.find("<title>") if start == -1: print "no title found" return "" title = title[(start+7):] end = title.find(" LYRICS</title>") if end == -1: print "no title end found" return "" title = title[:end] title = HTMLParser().unescape(title) songdata = title.split(" - ") try: if self.artist != songdata[0].lower() or self.title != songdata[1].lower(): print "wrong artist/title! " + songdata[0].lower() + " - " + songdata[1].lower() return "" except: print "incomplete artist/title" return "" self.lyrics = self.get_lyrics(resp) self.lyrics = string.capwords(self.lyrics, "\n").strip() return self.lyrics
def check_failure_log(api, bors_comment):
    """Post the 'Tests with unexpected results' section of a failed build's
    stdio log back as an indented comment.

    bors_comment would be something like,
    ":broken_heart: Test failed - [linux2](http://build.servo.org/builders/linux2/builds/2627)"  # noqa
    ... from which we get the relevant build result url.
    """
    # BUG FIX: iter(...).next() raised StopIteration when the comment had no
    # "(url)" part, so the `if not url` guard was unreachable for that case;
    # next(iterable, default) degrades gracefully instead.
    url = next(iter(re.findall(r'.*\((.*)\)', str(bors_comment))), None)
    if not url:
        return
    # Substitute and get the new url
    # (e.g. http://build.servo.org/json/builders/linux2/builds/2627)
    json_url = re.sub(r'(.*)(builders/.*)', r'\1json/\2', url)
    json_stuff = api.get_page_content(json_url)
    if not json_stuff:
        return
    build_stats = json.loads(json_stuff)
    # Locate the logs of the first failed step.
    build_log = []
    for step in build_stats['steps']:
        if 'failed' in step['text']:
            build_log = step['logs']
            break
    failed_url = None
    for (name, log_url) in build_log:
        if name == 'stdio':
            failed_url = log_url
            break
    if not failed_url:
        return
    stdio = api.get_page_content(failed_url)
    failure_regex = r'.*Tests with unexpected results:\n(.*)\n</span><span'
    # BUG FIX: an empty findall here also used to raise StopIteration, and a
    # missing match would have crashed unescape(); guard before decoding.
    failures = next(iter(re.findall(failure_regex, stdio, re.DOTALL)), None)
    if not failures:
        return
    failures = HTMLParser().unescape(failures)
    if failures:
        comments = [' ' * 4 + line for line in failures.split('\n')]
        api.post_comment('\n'.join(comments))
def parse_detail(self, response):
    """Parse a book-detail JSON response and yield a populated BookListItem.

    Skips responses that are not valid JSON (logging the raw body), API error
    responses (with a 'code' field), locked books, and unsigned books.
    """
    try:
        detail_json = json.loads(response.body)
    except ValueError:
        self.logger.error('[NO JSON]' + response.url)
        # BUG FIX: open in append mode -- mode 'w' truncated the log on every
        # failure, so only the most recent undecodable body was ever kept.
        with open('no_json_decoded.log', 'a') as f:
            f.write(response.body)
        return
    if 'code' in detail_json:
        # The API signals an error response with a 'code' field.
        return
    if int(detail_json['islock']):
        self.logger.debug('[THIS BOOK IS CLOCK] ' + response.url)
        return
    if int(detail_json['isSign']) == 0:
        self.logger.debug('[NOT SIGN] ' + response.url)
        return
    item = BookListItem()
    item['published_at'] = response.meta['published_at']
    item['book_id'] = response.meta['book_id']
    item['source_id'] = 7
    item['url'] = 'http://www.jjwxc.net/onebook.php?novelid={}'.format(item['book_id'])
    item['folder_url'] = detail_json['novelCover']
    item['title'] = detail_json['novelName']
    item['author'] = detail_json['authorName']
    item['author_id'] = detail_json['authorId']
    if detail_json['novelClass'] == '':
        item['category'] = ''
        item['sub_category'] = ''
    else:
        item['category'] = detail_json['novelClass']
        item['sub_category'] = ''  # detail_json['novelClass'].split('-')[-1]
    introduction = HTMLParser().unescape(detail_json['novelIntro'])
    item['introduction'] = '\n'.join(
        p.strip() for p in introduction.split('<br/>') if p != '')
    item['status'] = 1
    item['created_at'] = self.today
    item['updated_at'] = self.today
    yield item
def get_pagecount(self):
    u"""Get the number of result snippets reported on the results page.

    Tries each known results-count markup pattern in turn against
    ``self.content`` and returns the first count found; returns 0 when no
    pattern matches or no digits can be extracted.
    """
    pagecount = 0
    patterns = (ur'<div[^>]+resultStats(?:[^>]+)?>Результатов: примерно (.*?)<',
                ur'<div[^>]+resultStats(?:[^>]+)?>(.*?)<nobr>',
                ur'<div[^>]+resultStats(?:[^>]+)?>Результатов:(.*?)</div>',
                ur'<div>Результатов:\s*(.*?)</div>',
                ur'Результатов:\s*(.*?),.*?</div>',
                ur'из примерно <b>(.*?)</b>',
                ur'<div>Результаты:.*?из\s*<b>\s*(\d+)\s*</b>')
    response = self.content
    for pattern in patterns:
        res = re.findall(pattern, response, re.DOTALL | re.IGNORECASE | re.UNICODE | re.MULTILINE)
        if res:
            # HTML characters may appear as their numeric entity codes
            # (e.g. &#160;); decode the encoded sequences in the result string.
            res_str = HTMLParser().unescape(res[0])
            # Keep only the part before the first comma, then join its digits
            # (the count may be rendered with thousands separators).
            results_ints = re.findall(r'\d+', res_str.split(',')[0])
            if results_ints:
                return int(''.join(results_ints))
    return pagecount
def welcomemsg(html):
    """Extract the welcome message parts from the ``msg`` query parameter of
    the response's URL.

    Returns the HTML-unescaped message, stripped of '<b>' and the
    'WELCOME ' prefix, split on '</b>' into a list.
    """
    # BUG FIX: parse_qs was fed the whole URL, so the first query parameter's
    # key came out as "scheme://host/path?name" -- 'msg' was only found when
    # it was not the first parameter.  Parse the query component explicitly.
    query = urlparse.parse_qs(urlparse.urlparse(html.geturl()).query)
    messg = HTMLParser().unescape(query['msg'][0])
    messg = messg.replace('<b>', '').replace('WELCOME ', '')
    return messg.split('</b>')
[i.extract() for i in c] s = soup.findAll('style') [i.extract() for i in s] try: texts = ''.join(soup.findAll(text=True)) texts = string.replace(texts, '\r', '\n') from HTMLParser import HTMLParser texts = HTMLParser().unescape(texts) except Exception, err: _logger.error( 'BeautifulSoup created but it failed to process doc, url: ' + doc['url'] + '\n' + traceback.format_exc()) return [] else: return [ re.sub(r'\s+', ' ', text.strip()) for text in texts.split('\n\n') ] def interpret(inpath, outpath): with open(inpath, "rb") as crawled_docs: docs = cPickle.load(crawled_docs) _logger.info('found ' + str(len(docs)) + ' docs from crawler\'s output') output_str = u'' for doc in docs: _logger.info('processing doc from url: ' + doc['url']) contents = parse_html(doc) output_str += unicode(doc['url'] + '\n\n' + '+' * 100 + '\n\n') for paragraph in contents: if is_valid_text(paragraph):
class Artist(object):
    """
    This class represents an Artist. It is created knowing only its ID.
    To reduce API accesses, load information using Artist.update_data()
    only as needed.
    """

    def __init__(self, id, parent_api):
        # id: numeric artist id (may be non-positive when only a name is known).
        # parent_api: gateway object providing request() and the shared caches.
        self.id = id
        self.parent_api = parent_api
        self.name = None
        self.notifications_enabled = None
        self.has_bookmarked = None
        self.image = None
        self.body = None
        self.vanity_house = None
        self.tags = []
        self.similar_artists_and_score = {}
        self.statistics = None
        self.torrent_groups = []
        self.requests = []
        self.parent_api.cached_artists[self.id] = self  # add self to cache of known Artist objects

    def update_data(self):
        # Prefer lookup by positive id; otherwise fall back to a name lookup,
        # retrying with only the first artist of an "A & B" name on failure.
        if self.id > 0:
            response = self.parent_api.request(action='artist', id=self.id)
        elif self.name:
            self.name = HTMLParser().unescape(self.name)
            try:
                response = self.parent_api.request(action='artist', artistname=self.name)
            except Exception:
                self.name = self.name.split(" & ")[0]
                response = self.parent_api.request(action='artist', artistname=self.name)
        else:
            raise InvalidArtistException("Neither ID or Artist Name is valid, can't update data.")
        self.set_data(response)

    def set_data(self, artist_json_response):
        # Populate every attribute from an 'artist' API response dict, and
        # register/refresh related objects via the parent_api caches.
        if self.id > 0 and self.id != artist_json_response['id']:
            raise InvalidArtistException("Tried to update an artists's information from an 'artist' API call with a different id."
                                         + " Should be %s, got %s" % (self.id, artist_json_response['id']) )
        elif self.name:
            # Looked up by name: adopt the id from the response and cache self.
            self.id = artist_json_response['id']
            self.parent_api.cached_artists[self.id] = self
        self.name = HTMLParser().unescape(artist_json_response['name'])
        self.notifications_enabled = artist_json_response['notificationsEnabled']
        self.has_bookmarked = artist_json_response['hasBookmarked']
        self.image = artist_json_response['image']
        self.body = artist_json_response['body']
        self.vanity_house = artist_json_response['vanityHouse']
        self.tags = []
        for tag_dict in artist_json_response['tags']:
            tag = self.parent_api.get_tag(tag_dict['name'])
            tag.set_artist_count(self, tag_dict['count'])
            self.tags.append(tag)
        self.similar_artists_and_score = {}
        for similar_artist_dict in artist_json_response['similarArtists']:
            similar_artist = self.parent_api.get_artist(similar_artist_dict['artistId'])
            similar_artist.name = similar_artist_dict['name']
            self.similar_artists_and_score[similar_artist] = similar_artist_dict['score']
        self.statistics = artist_json_response['statistics']
        self.torrent_groups = []
        for torrent_group_item in artist_json_response['torrentgroup']:
            torrent_group = self.parent_api.get_torrent_group(torrent_group_item['groupId'])
            torrent_group.set_artist_group_data(torrent_group_item)
            self.torrent_groups.append(torrent_group)
        self.requests = []
        for request_json_item in artist_json_response['requests']:
            request = self.parent_api.get_request(request_json_item['requestId'])
            request.set_data(request_json_item)
            self.requests.append(request)

    def __repr__(self):
        return "Artist: %s - ID: %s" % (self.name, self.id)
class Artist(object):
    """Represents a single Artist, created knowing only its ID.

    Attribute data is loaded lazily: call ``update_data()`` only when the
    full record is needed, to keep API traffic down.
    """

    def __init__(self, id, parent_api):
        self.id = id
        self.parent_api = parent_api
        self.name = None
        self.notifications_enabled = None
        self.has_bookmarked = None
        self.image = None
        self.body = None
        self.vanity_house = None
        self.tags = []
        self.similar_artists_and_score = {}
        self.statistics = None
        self.torrent_groups = []
        self.requests = []
        # Register this instance in the shared cache of known Artist objects.
        self.parent_api.cached_artists[self.id] = self

    def update_data(self):
        """Fetch this artist's record, by id when available, else by name."""
        if self.id > 0:
            response = self.parent_api.request(action='artist', id=self.id)
        elif self.name:
            self.name = HTMLParser().unescape(self.name)
            try:
                response = self.parent_api.request(
                    action='artist', artistname=self.name)
            except Exception:
                # A combined "A & B" name can fail; retry with the first name.
                self.name = self.name.split(" & ")[0]
                response = self.parent_api.request(
                    action='artist', artistname=self.name)
        else:
            raise InvalidArtistException(
                "Neither ID or Artist Name is valid, can't update data.")
        self.set_data(response)

    def set_data(self, artist_json_response):
        """Populate all attributes from an 'artist' API response dict."""
        if self.id > 0 and self.id != artist_json_response['id']:
            raise InvalidArtistException(
                "Tried to update an artists's information from an 'artist' API call with a different id." + " Should be %s, got %s" % (self.id, artist_json_response['id']))
        elif self.name:
            # Looked up by name: adopt the response's id and cache ourselves.
            self.id = artist_json_response['id']
            self.parent_api.cached_artists[self.id] = self
        self.name = HTMLParser().unescape(artist_json_response['name'])
        self.notifications_enabled = artist_json_response['notificationsEnabled']
        self.has_bookmarked = artist_json_response['hasBookmarked']
        self.image = artist_json_response['image']
        self.body = artist_json_response['body']
        self.vanity_house = artist_json_response['vanityHouse']
        self.tags = []
        for tag_info in artist_json_response['tags']:
            tag = self.parent_api.get_tag(tag_info['name'])
            tag.set_artist_count(self, tag_info['count'])
            self.tags.append(tag)
        self.similar_artists_and_score = {}
        for similar_info in artist_json_response['similarArtists']:
            similar = self.parent_api.get_artist(similar_info['artistId'])
            similar.name = similar_info['name']
            self.similar_artists_and_score[similar] = similar_info['score']
        self.statistics = artist_json_response['statistics']
        self.torrent_groups = []
        for group_info in artist_json_response['torrentgroup']:
            group = self.parent_api.get_torrent_group(group_info['groupId'])
            group.set_artist_group_data(group_info)
            self.torrent_groups.append(group)
        self.requests = []
        for request_info in artist_json_response['requests']:
            req = self.parent_api.get_request(request_info['requestId'])
            req.set_data(request_info)
            self.requests.append(req)

    def __repr__(self):
        return "Artist: %s - ID: %s" % (self.name, self.id)